diff --git a/package-lock.json b/package-lock.json
index 3e439a7..5383681 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,7 +11,8 @@
         "packages/ai",
         "packages/agent",
         "packages/ptywright",
-        "packages/cli"
+        "packages/cli",
+        "packages/bench"
       ],
       "devDependencies": {
         "@types/node": "22.18.4",
@@ -3455,6 +3456,10 @@
       "resolved": "packages/ai",
       "link": true
     },
+    "node_modules/@onkernel/cua-bench": {
+      "resolved": "packages/bench",
+      "link": true
+    },
     "node_modules/@onkernel/cua-cli": {
       "resolved": "packages/cli",
       "link": true
@@ -6100,6 +6105,19 @@
         "vitest": "^3.2.4"
       }
     },
+    "packages/bench": {
+      "name": "@onkernel/cua-bench",
+      "version": "0.0.0",
+      "license": "MIT",
+      "dependencies": {
+        "@onkernel/cua-agent": "*",
+        "@onkernel/cua-ai": "*",
+        "@onkernel/sdk": "0.49.0"
+      },
+      "devDependencies": {
+        "tsx": "^4.21.0"
+      }
+    },
     "packages/cli": {
       "name": "@onkernel/cua-cli",
       "version": "0.1.4",
diff --git a/package.json b/package.json
index 1f5397e..7c2266d 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,8 @@
     "packages/ai",
     "packages/agent",
     "packages/ptywright",
-    "packages/cli"
+    "packages/cli",
+    "packages/bench"
   ],
   "scripts": {
     "build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",
diff --git a/packages/bench/.gitignore b/packages/bench/.gitignore
new file mode 100644
index 0000000..ef55106
--- /dev/null
+++ b/packages/bench/.gitignore
@@ -0,0 +1,2 @@
+results/
+tasks/
diff --git a/packages/bench/package.json b/packages/bench/package.json
new file mode 100644
index 0000000..bda0753
--- /dev/null
+++ b/packages/bench/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "@onkernel/cua-bench",
+  "version": "0.0.0",
+  "description": "Benchmark runner for CUA models on Kernel cloud browsers",
+  "license": "MIT",
+  "type": "module",
+  "private": true,
+  "exports": {
+    ".": {
+      "types": "./dist-tsc/index.d.ts",
+      "source": "./src/index.ts"
+    }
+  },
+  "scripts": {
+    "bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts",
+    "aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts",
+    "typecheck": "tsc -b"
+  },
+  "dependencies": {
+    "@onkernel/cua-agent": "*",
+    "@onkernel/cua-ai": "*",
+    "@onkernel/sdk": "0.49.0"
+  },
+  "devDependencies": {
+    "tsx": "^4.21.0"
+  }
+}
diff --git a/packages/bench/scripts/fetch-tasks.py b/packages/bench/scripts/fetch-tasks.py
new file mode 100755
index 0000000..8403649
--- /dev/null
+++ b/packages/bench/scripts/fetch-tasks.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""Fetch Online-Mind2Web tasks into the JSON the TS harness reads.
+
+The dataset is gated, so this needs an HF token (HF_TOKEN env, or `huggingface-cli login`).
+
+    pip install datasets
+    HF_TOKEN=hf_... python scripts/fetch-tasks.py --out tasks/online-mind2web-test.json
+"""
+import argparse
+import json
+import os
+from pathlib import Path
+
+from datasets import load_dataset
+
+
+def main() -> None:
+    """Download the requested split and write the fields the harness needs as a JSON array."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out", default="tasks/online-mind2web-test.json")
+    parser.add_argument("--split", default="test")
+    args = parser.parse_args()
+
+    ds = load_dataset("osunlp/Online-Mind2Web", split=args.split, token=os.environ.get("HF_TOKEN"))
+    tasks = [
+        {
+            "task_id": row["task_id"],
+            "website": row["website"],
+            "confirmed_task": row["confirmed_task"],
+            "reference_length": int(row["reference_length"]) if row.get("reference_length") is not None else 1,
+        }
+        for row in ds
+    ]
+
+    out = Path(args.out)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(tasks, indent=2))
+    print(f"wrote {len(tasks)} tasks to {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/bench/scripts/run-webjudge.sh b/packages/bench/scripts/run-webjudge.sh
new file mode 100755
index 0000000..f56801f
--- /dev/null
+++ b/packages/bench/scripts/run-webjudge.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Score benchmark trajectories with the OFFICIAL Online-Mind2Web WebJudge.
+#
+# Clones the upstream OSU-NLP repo and runs its WebJudge over each model's
+# trajectories (which the harness already wrote in the official v2 schema),
+# then normalizes the output to <model>/webjudge.jsonl for the aggregator.
+#
+#   OPENAI_API_KEY=... scripts/run-webjudge.sh results [judge-model] [score-threshold]
+set -euo pipefail
+
+RESULTS_DIR="$(cd "${1:-results}" && pwd)"
+JUDGE_MODEL="${2:-o4-mini}"
+THRESHOLD="${3:-3}"
+: "${OPENAI_API_KEY:?OPENAI_API_KEY is required for WebJudge}"
+
+WORKDIR="$(mktemp -d)"
+trap 'rm -rf "$WORKDIR"' EXIT
+git clone --depth 1 https://github.com/OSU-NLP-Group/Online-Mind2Web "$WORKDIR/om2w"
+pip install -q -r "$WORKDIR/om2w/requirements.txt"
+
+for MODEL_DIR in "$RESULTS_DIR"/*/; do
+  [ -d "$MODEL_DIR" ] || continue
+  MODEL_DIR="${MODEL_DIR%/}"
+  echo "== WebJudge: $MODEL_DIR =="
+  ( cd "$WORKDIR/om2w/src" && python run.py \
+    --mode WebJudge_Online_Mind2Web_eval \
+    --model "$JUDGE_MODEL" \
+    --trajectories_dir "$MODEL_DIR" \
+    --api_key "$OPENAI_API_KEY" \
+    --output_path "$MODEL_DIR" \
+    --score_threshold "$THRESHOLD" )
+  # Expected results file (name embeds judge model and threshold); copy it to the name aggregate.ts reads.
+  OUT="$MODEL_DIR/WebJudge_Online_Mind2Web_eval_${JUDGE_MODEL}_score_threshold_${THRESHOLD}_auto_eval_results.json"
+  if [ -f "$OUT" ]; then
+    cp "$OUT" "$MODEL_DIR/webjudge.jsonl"
+  else
+    echo "warning: expected WebJudge output not found: $OUT" >&2
+  fi
+done
+
+echo "WebJudge complete — aggregate with: npm run aggregate"
diff --git a/packages/bench/src/aggregate.ts b/packages/bench/src/aggregate.ts
new file mode 100644
index 0000000..2398107
--- /dev/null
+++ b/packages/bench/src/aggregate.ts
@@ -0,0 +1,128 @@
+import { readdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import type { ModelSummary, TaskMetrics } from "./types";
+
+/**
+ * Roll per-task results into one ModelSummary per model. Cost/speed come from
+ * the `metrics.json` sidecars; accuracy comes from an optional WebJudge output
+ * (`<model>/webjudge.jsonl`, one `{task_id, predicted_label}` per line) written
+ * by `scripts/run-webjudge.sh`. Accuracy is null until that file exists.
+ *
+ * The accuracy denominator is every task the model attempted — completed runs
+ * plus tasks that exhausted their retries without producing a trajectory — so a
+ * task the model can never finish counts as a failure rather than silently
+ * dropping out of the rate.
+ */
+export async function aggregate(outDir: string): Promise<ModelSummary[]> {
+  const summaries: ModelSummary[] = [];
+  const modelDirs = await readdir(outDir, { withFileTypes: true });
+
+  for (const entry of modelDirs) {
+    if (!entry.isDirectory()) continue;
+    const modelDir = join(outDir, entry.name);
+    const metrics = await readMetrics(modelDir);
+    if (metrics.length === 0) continue;
+
+    const failed = await countExhaustedTasks(modelDir);
+    const attempted = metrics.length + failed;
+    const judged = await readJudgements(join(modelDir, "webjudge.jsonl"));
+    const costs = metrics.map((m) => m.costUsd).filter((c): c is number => c !== null);
+    const passed = judged ? metrics.filter((m) => judged.get(m.task_id) === true).length : null;
+
+    summaries.push({
+      model: metrics[0]!.model,
+      tasks: attempted,
+      passed,
+      accuracyPct: judged ? round((passed! / attempted) * 100, 1) : null,
+      avgCostUsd: costs.length ? round(sum(costs) / costs.length, 4) : null,
+      avgSpeedSec: round(sum(metrics.map((m) => m.wallClockMs)) / metrics.length / 1000, 1),
+    });
+  }
+
+  await writeFile(join(outDir, "summary.json"), `${JSON.stringify(summaries, null, 2)}\n`);
+  printTable(summaries);
+  return summaries;
+}
+
+/** Collect every parseable `metrics.json` found one directory level below `modelDir`. */
+async function readMetrics(modelDir: string): Promise<TaskMetrics[]> {
+  const out: TaskMetrics[] = [];
+  for (const entry of await readdir(modelDir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    try {
+      out.push(JSON.parse(await readFile(join(modelDir, entry.name, "metrics.json"), "utf8")));
+    } catch {
+      // task dir without a finished metrics.json — not yet run
+    }
+  }
+  return out;
+}
+
+/**
+ * Count task directories that have an `attempts` file but no `metrics.json`,
+ * i.e. tasks that failed at least once and never completed. Once a run has been
+ * resumed until nothing is left to retry, these are the permanent failures.
+ */
+async function countExhaustedTasks(modelDir: string): Promise<number> {
+  let count = 0;
+  for (const entry of await readdir(modelDir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    const taskDir = join(modelDir, entry.name);
+    const hasMetrics = await readFile(join(taskDir, "metrics.json"), "utf8").then(() => true).catch(() => false);
+    if (hasMetrics) continue;
+    const hasAttempts = await readFile(join(taskDir, "attempts"), "utf8").then(() => true).catch(() => false);
+    if (hasAttempts) count++;
+  }
+  return count;
+}
+
+/** Parse a JSONL judgement file into task_id → pass/fail; returns undefined when the file cannot be read. */
+async function readJudgements(path: string): Promise<Map<string, boolean> | undefined> {
+  let raw: string;
+  try {
+    raw = await readFile(path, "utf8");
+  } catch {
+    return undefined;
+  }
+  const map = new Map<string, boolean>();
+  for (const line of raw.split("\n")) {
+    if (!line.trim()) continue;
+    const row = JSON.parse(line) as { task_id: string; predicted_label: unknown };
+    map.set(row.task_id, isPass(row.predicted_label));
+  }
+  return map;
+}
+
+/** Normalize a judge label: 1, true, or the strings "1"/"success"/"yes"/"true" (any case) count as a pass. */
+function isPass(label: unknown): boolean {
+  if (typeof label === "number") return label === 1;
+  if (typeof label === "boolean") return label;
+  if (typeof label === "string") return ["1", "success", "yes", "true"].includes(label.toLowerCase());
+  return false;
+}
+
+/** Print a tab-separated model/accuracy/cost/speed table, with an em dash for unknown values. */
+function printTable(summaries: ModelSummary[]): void {
+  console.log("\nmodel\taccuracy\tcost/task\tspeed");
+  for (const s of summaries) {
+    const acc = s.accuracyPct === null ? "—" : `${s.accuracyPct}%`;
+    const cost = s.avgCostUsd === null ? "—" : `$${s.avgCostUsd}`;
+    console.log(`${s.model}\t${acc}\t${cost}\t${s.avgSpeedSec}s`);
+  }
+}
+
+function sum(xs: number[]): number {
+  return xs.reduce((a, b) => a + b, 0);
+}
+
+function round(x: number, places: number): number {
+  const f = 10 ** places;
+  return Math.round(x * f) / f;
+}
+
+if (process.argv[1]?.endsWith("aggregate.ts")) {
+  aggregate(process.argv[2] ?? "results").catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
diff --git a/packages/bench/src/benchmark.ts b/packages/bench/src/benchmark.ts
new file mode 100644
index 0000000..b0ec7fd
--- /dev/null
+++ b/packages/bench/src/benchmark.ts
@@ -0,0 +1,132 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+import { access, mkdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import { createKernelClient, DEFAULT_BROWSER_SETTINGS } from "./browser";
+import { runPool } from "./pool";
+import { modelSlug, runOne } from "./runOne";
+import { loadTasks } from "./tasks";
+
+const DEFAULT_MODELS: CuaModelRef[] = [
+  "anthropic:claude-opus-4-6",
+  "openai:gpt-5.5",
+  "google:gemini-3-flash-preview",
+];
+
+/**
+ * How many times a task may error before it's treated as a permanent failure
+ * for that model and stops being retried. Some tasks fail deterministically
+ * (e.g. a model whose trajectory grows past the provider's max request size),
+ * so without a cap a resumable run would retry them forever.
+ */
+const MAX_ATTEMPTS = 3;
+
+interface Options {
+  tasksPath: string;
+  outDir: string;
+  limit?: number;
+  concurrency: number;
+  models: CuaModelRef[];
+}
+
+/** Parse --tasks/--out/--limit/--concurrency/--models over the built-in defaults; unknown arguments are ignored. */
+function parseArgs(argv: string[]): Options {
+  const opts: Options = {
+    tasksPath: "tasks/online-mind2web-test.json",
+    outDir: "results",
+    concurrency: 5,
+    models: DEFAULT_MODELS,
+  };
+  for (let i = 0; i < argv.length; i++) {
+    const value = () => argv[++i] ?? "";
+    switch (argv[i]) {
+      case "--tasks":
+        opts.tasksPath = value();
+        break;
+      case "--out":
+        opts.outDir = value();
+        break;
+      case "--limit":
+        opts.limit = Number(value());
+        break;
+      case "--concurrency":
+        opts.concurrency = Number(value());
+        break;
+      case "--models":
+        opts.models = value().split(",").map((s) => s.trim()) as CuaModelRef[];
+        break;
+    }
+  }
+  return opts;
+}
+
+/** True when `path` is accessible, false on any access error. */
+async function exists(path: string): Promise<boolean> {
+  try {
+    await access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/** Read the failed-attempt counter from the task's `attempts` file; a missing or unparsable file counts as 0. */
+async function readAttempts(taskDir: string): Promise<number> {
+  try {
+    return Number.parseInt(await readFile(join(taskDir, "attempts"), "utf8"), 10) || 0;
+  } catch {
+    return 0;
+  }
+}
+
+/** Persist the failed-attempt counter, creating the task directory if needed. */
+async function recordAttempt(taskDir: string, count: number): Promise<void> {
+  await mkdir(taskDir, { recursive: true });
+  await writeFile(join(taskDir, "attempts"), String(count));
+}
+
+/** Run every task against each model in turn, skipping tasks that already have a result.json or used up MAX_ATTEMPTS. */
+async function main(): Promise<void> {
+  const opts = parseArgs(process.argv.slice(2));
+  const client = createKernelClient();
+  const tasks = await loadTasks(opts.tasksPath, opts.limit);
+  console.log(`[bench] ${tasks.length} tasks × ${opts.models.length} models, concurrency ${opts.concurrency}`);
+
+  for (const model of opts.models) {
+    const slug = modelSlug(model);
+    console.log(`[bench] === ${model} ===`);
+    let done = 0;
+    let failed = 0;
+    let skipped = 0;
+    let exhausted = 0;
+    await runPool(tasks, opts.concurrency, async (task) => {
+      const taskDir = join(opts.outDir, slug, task.task_id);
+      if (await exists(join(taskDir, "result.json"))) {
+        skipped++;
+        return;
+      }
+      const attempts = await readAttempts(taskDir);
+      if (attempts >= MAX_ATTEMPTS) {
+        exhausted++;
+        return;
+      }
+      try {
+        const m = await runOne(client, model, task, DEFAULT_BROWSER_SETTINGS, taskDir);
+        done++;
+        console.log(`[bench] ${slug} ${task.task_id} ok steps=${m.steps} ${(m.wallClockMs / 1000).toFixed(1)}s`);
+      } catch (err) {
+        await recordAttempt(taskDir, attempts + 1);
+        failed++;
+        console.error(`[bench] ${slug} ${task.task_id} FAILED (attempt ${attempts + 1}/${MAX_ATTEMPTS}): ${(err as Error).message}`);
+      }
+    });
+    console.log(`[bench] ${slug}: done=${done} skipped=${skipped} failed=${failed} exhausted=${exhausted}`);
+  }
+
+  console.log(`[bench] complete — results in ${opts.outDir}/`);
+  console.log("[bench] next: score with scripts/run-webjudge.sh, then aggregate with src/aggregate.ts");
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/packages/bench/src/browser.ts b/packages/bench/src/browser.ts
new file mode 100644
index 0000000..01f5b18
--- /dev/null
+++ b/packages/bench/src/browser.ts
@@ -0,0 +1,56 @@
+import type { KernelBrowser } from "@onkernel/cua-agent";
+import Kernel from "@onkernel/sdk";
+
+/** Browser configuration held constant across every model so the only variable is the model. */
+export interface BrowserSettings {
+  stealth: boolean;
+  viewport: { width: number; height: number };
+  timeoutSeconds: number;
+}
+
+/** Benchmark defaults: stealth on, fresh unauthenticated profile, generous timeout. */
+export const DEFAULT_BROWSER_SETTINGS: BrowserSettings = {
+  stealth: true,
+  viewport: { width: 1280, height: 800 },
+  timeoutSeconds: 600,
+};
+
+/** A provisioned browser and its client, as returned by `provisionBrowser`; `close()` deletes the browser session. */
+export interface BrowserHandle {
+  client: Kernel;
+  browser: KernelBrowser;
+  close(): Promise<void>;
+}
+
+/** Build a Kernel client from `apiKey` or, failing that, the KERNEL_API_KEY env var; throws when neither is set. */
+export function createKernelClient(apiKey?: string): Kernel {
+  const key = apiKey ?? process.env.KERNEL_API_KEY;
+  if (!key) throw new Error("KERNEL_API_KEY is required");
+  return new Kernel({ apiKey: key });
+}
+
+/** Provision a fresh Kernel browser under the given settings. */
+export async function provisionBrowser(client: Kernel, settings: BrowserSettings): Promise<BrowserHandle> {
+  const browser = await client.browsers.create({
+    stealth: settings.stealth,
+    viewport: settings.viewport,
+    timeout_seconds: settings.timeoutSeconds,
+  });
+  return {
+    client,
+    browser,
+    close: async () => {
+      await client.browsers.deleteByID(browser.session_id).catch(() => {});
+    },
+  };
+}
+
+/** Best-effort screenshot of the session as a Buffer; returns undefined on any failure instead of throwing. */
+export async function captureScreenshot(client: Kernel, sessionId: string): Promise<Buffer | undefined> {
+  try {
+    const response = await client.browsers.computer.captureScreenshot(sessionId);
+    return Buffer.from(await response.arrayBuffer());
+  } catch {
+    return undefined;
+  }
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
new file mode 100644
index 0000000..e214a07
--- /dev/null
+++ b/packages/bench/src/index.ts
@@ -0,0 +1,19 @@
+export { runOne, modelSlug } from "./runOne";
+export { loadTasks } from "./tasks";
+export { aggregate } from "./aggregate";
+export { runPool } from "./pool";
+export { recordTrajectory } from "./trajectory";
+export {
+  type BrowserSettings,
+  DEFAULT_BROWSER_SETTINGS,
+  provisionBrowser,
+  createKernelClient,
+} from "./browser";
+export type {
+  ActionStep,
+  ModelSummary,
+  Om2wResult,
+  Om2wTask,
+  TaskMetrics,
+  TokenTotals,
+} from "./types";
diff --git a/packages/bench/src/pool.ts b/packages/bench/src/pool.ts
new file mode 100644
index 0000000..89aa66c
--- /dev/null
+++ b/packages/bench/src/pool.ts
@@ -0,0 +1,23 @@
+/**
+ * Run `worker` over `items` with at most `concurrency` in flight at once.
+ *
+ * A NaN `concurrency` (e.g. `Number("abc")` from a mistyped CLI flag) falls back
+ * to one lane: `Math.max(1, NaN)` is NaN, which would otherwise yield zero lanes
+ * and resolve without running any item.
+ */
+export async function runPool<T>(
+  items: T[],
+  concurrency: number,
+  worker: (item: T, index: number) => Promise<void>,
+): Promise<void> {
+  const requested = Number.isNaN(concurrency) ? 1 : Math.max(1, concurrency);
+  let next = 0;
+  const lanes = Array.from({ length: Math.min(requested, items.length) }, async () => {
+    while (true) {
+      const index = next++;
+      if (index >= items.length) return;
+      await worker(items[index]!, index);
+    }
+  });
+  await Promise.all(lanes);
+}
diff --git a/packages/bench/src/runOne.ts b/packages/bench/src/runOne.ts
new file mode 100644
index 0000000..7631805
--- /dev/null
+++ b/packages/bench/src/runOne.ts
@@ -0,0 +1,142 @@
+import { CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
+import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai";
+import type Kernel from "@onkernel/sdk";
+import { mkdir, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { type BrowserSettings, captureScreenshot, provisionBrowser } from "./browser";
+import { recordTrajectory, type TrajectoryRecording } from "./trajectory";
+import type { ActionStep, Om2wResult, Om2wTask, TaskMetrics } from "./types";
+
+/** Filesystem-safe slug for a provider-qualified model ref like `anthropic:claude-opus-4-6`. */
+export function modelSlug(model: CuaModelRef): string {
+  return model.replace(/[^a-zA-Z0-9._-]/g, "-");
+}
+
+/** Prefix the task with "Go to {website} and" unless the task text already mentions the website (case-insensitive). */
+function buildPrompt(task: Om2wTask): string {
+  if (task.website && !task.confirmed_task.toLowerCase().includes(task.website.toLowerCase())) {
+    return `Go to ${task.website} and ${task.confirmed_task}`;
+  }
+  return task.confirmed_task;
+}
+
+/**
+ * Run one Online-Mind2Web task on one model against a fresh Kernel browser and
+ * write the official v2 trajectory (`result.json` + `trajectory/`) plus a
+ * `metrics.json` cost/speed sidecar into `taskDir`.
+ *
+ * Throws (and writes nothing) when the agent stops with `error` or `aborted`,
+ * or when any setup/prompt step rejects; the browser is closed either way.
+ */
+export async function runOne(
+  client: Kernel,
+  model: CuaModelRef,
+  task: Om2wTask,
+  settings: BrowserSettings,
+  taskDir: string,
+): Promise<TaskMetrics> {
+  const handle = await provisionBrowser(client, settings);
+  let recording: TrajectoryRecording;
+  let startedAt: number;
+  let stopReason = "completed";
+  let errorMessage: string | undefined;
+  // Session and harness setup run inside the try so a failure there still
+  // closes the provisioned browser instead of leaving it running.
+  try {
+    const cwd = process.cwd();
+    const repo = new JsonlSessionRepo({
+      fs: new NodeExecutionEnv({ cwd }),
+      sessionsRoot: join(tmpdir(), "cua-bench", "sessions"),
+    });
+    const session = await repo.create({ cwd });
+
+    const harness = new CuaAgentHarness({
+      env: new NodeExecutionEnv({ cwd }),
+      session,
+      model,
+      browser: handle.browser,
+      client,
+      systemPrompt: ({ model: active }) => resolveCuaRuntimeSpec(active).defaultSystemPrompt,
+      getApiKeyAndHeaders: async (resolved) => {
+        const apiKey = getCuaEnvApiKey(resolved.provider);
+        return apiKey ? { apiKey } : undefined;
+      },
+    });
+
+    const subscription = recordTrajectory(harness);
+    recording = subscription.recording;
+    startedAt = Date.now();
+    try {
+      const shot = await captureScreenshot(handle.client, handle.browser.session_id);
+      const images: ImageContent[] | undefined = shot
+        ? [{ type: "image", data: shot.toString("base64"), mimeType: "image/png" }]
+        : undefined;
+      const assistant = await harness.prompt(buildPrompt(task), images ? { images } : undefined);
+      stopReason = assistant.stopReason;
+      if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
+        errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
+      }
+    } finally {
+      subscription.stop();
+    }
+  } finally {
+    await handle.close();
+  }
+
+  if (stopReason === "error" || stopReason === "aborted") {
+    // Throw instead of persisting: result.json is the resume sentinel, so writing
+    // one for a failed run would bake an empty trajectory into the scored set and
+    // permanently skip the retry.
+    throw new Error(errorMessage ?? `agent stopped with ${stopReason}`);
+  }
+
+  const wallClockMs = Date.now() - startedAt;
+  const metrics: TaskMetrics = {
+    task_id: task.task_id,
+    model,
+    wallClockMs,
+    steps: recording.turns,
+    tokens: recording.tokens,
+    costUsd: recording.costUsd,
+    stopReason,
+    errorMessage,
+  };
+  await writeArtifacts(taskDir, task, recording, metrics);
+  return metrics;
+}
+
+/**
+ * Persist the screenshots, `metrics.json`, and `result.json` for a finished run.
+ * `result.json` is written last because it is the resume sentinel: if the
+ * process dies mid-write, the task is retried rather than skipped forever with
+ * its metrics missing from the aggregate.
+ */
+async function writeArtifacts(
+  taskDir: string,
+  task: Om2wTask,
+  recording: TrajectoryRecording,
+  metrics: TaskMetrics,
+): Promise<void> {
+  const trajectoryDir = join(taskDir, "trajectory");
+  await mkdir(trajectoryDir, { recursive: true });
+
+  const action_history: ActionStep[] = [];
+  for (let i = 0; i < recording.steps.length; i++) {
+    const step = recording.steps[i]!;
+    const screenshot = `${String(i).padStart(4, "0")}.png`;
+    await writeFile(join(trajectoryDir, screenshot), step.screenshot);
+    action_history.push({ step: i, screenshot, action: step.action, thought: step.thought, url: null });
+  }
+
+  const result: Om2wResult = {
+    schema_version: "online-mind2web-v2",
+    task: task.confirmed_task,
+    task_id: task.task_id,
+    agent_final_answer: recording.finalAnswer,
+    reference_length: task.reference_length,
+    action_history,
+  };
+  await writeFile(join(taskDir, "metrics.json"), `${JSON.stringify(metrics, null, 2)}\n`);
+  await writeFile(join(taskDir, "result.json"), `${JSON.stringify(result, null, 2)}\n`);
+}
diff --git a/packages/bench/src/tasks.ts b/packages/bench/src/tasks.ts
new file mode 100644
index 0000000..4a5c87a
--- /dev/null
+++ b/packages/bench/src/tasks.ts
@@ -0,0 +1,18 @@
+import { readFile } from "node:fs/promises";
+import type { Om2wTask } from "./types";
+
+/**
+ * Load Online-Mind2Web tasks from a local JSON file produced by
+ * `scripts/fetch-tasks.py` (the dataset is gated, so it's fetched with the
+ * official `datasets` loader rather than over HTTP).
+ */
+export async function loadTasks(path: string, limit?: number): Promise<Om2wTask[]> {
+  let raw: string;
+  try {
+    raw = await readFile(path, "utf8");
+  } catch {
+    throw new Error(`task file not found at ${path} — generate it with: python scripts/fetch-tasks.py --out ${path}`);
+  }
+  const tasks = JSON.parse(raw) as Om2wTask[];
+  return typeof limit === "number" ? tasks.slice(0, limit) : tasks;
+}
diff --git a/packages/bench/src/trajectory.ts b/packages/bench/src/trajectory.ts
new file mode 100644
index 0000000..4acf128
--- /dev/null
+++ b/packages/bench/src/trajectory.ts
@@ -0,0 +1,104 @@
+import type { AgentHarnessEvent, CuaAgentHarness } from "@onkernel/cua-agent";
+import type { TokenTotals } from "./types";
+
+/** A trajectory step before it's assigned a screenshot filename. */
+export interface RecordedStep {
+  action: string;
+  thought: string | null;
+  screenshot: Buffer;
+}
+
+/** Everything accumulated from one harness run; `costUsd` stays null until a message reports a positive cost. */
+export interface TrajectoryRecording {
+  steps: RecordedStep[];
+  finalAnswer: string | null;
+  tokens: TokenTotals;
+  costUsd: number | null;
+  turns: number;
+}
+
+/**
+ * Subscribe to a running harness and accumulate the data WebJudge needs:
+ * one step per computer action that produced a screenshot, the agent's final
+ * answer, and summed token/cost usage. Returns the live recording plus an
+ * unsubscribe handle.
+ */
+export function recordTrajectory(harness: CuaAgentHarness): {
+  recording: TrajectoryRecording;
+  stop: () => void;
+} {
+  const recording: TrajectoryRecording = {
+    steps: [],
+    finalAnswer: null,
+    tokens: { input: 0, output: 0, total: 0 },
+    costUsd: null,
+    turns: 0,
+  };
+  const pendingActions = new Map<string, string>();
+  let currentThought: string | null = null;
+
+  const stop = harness.subscribe((event: AgentHarnessEvent) => {
+    switch (event.type) {
+      case "turn_start":
+        recording.turns += 1;
+        return;
+      case "message_end": {
+        if (event.message.role !== "assistant") return;
+        const text = textOf(event.message.content);
+        if (text) {
+          currentThought = text;
+          recording.finalAnswer = text;
+        }
+        const { usage } = event.message;
+        recording.tokens.input += usage.input;
+        recording.tokens.output += usage.output;
+        recording.tokens.total += usage.totalTokens;
+        if (usage.cost.total > 0) recording.costUsd = (recording.costUsd ?? 0) + usage.cost.total;
+        return;
+      }
+      case "tool_execution_start":
+        pendingActions.set(event.toolCallId, formatAction(event.toolName, event.args));
+        return;
+      case "tool_execution_end": {
+        const action = pendingActions.get(event.toolCallId) ?? event.toolName;
+        pendingActions.delete(event.toolCallId);
+        const screenshot = screenshotOf(event.result);
+        if (screenshot) recording.steps.push({ action, thought: currentThought, screenshot });
+        return;
+      }
+      default:
+        return;
+    }
+  });
+
+  return { recording, stop };
+}
+
+/** Render a tool call as the tool name followed by its JSON-encoded (or stringified) args, if any. */
+function formatAction(toolName: string, args: unknown): string {
+  const rendered = args && typeof args === "object" ? JSON.stringify(args) : String(args ?? "");
+  return rendered ? `${toolName} ${rendered}` : toolName;
+}
+
+/** Decode the first base64 image part of a tool result, if there is one. */
+function screenshotOf(result: unknown): Buffer | undefined {
+  const content = (result as { content?: Array<{ type?: string; data?: string }> } | undefined)?.content;
+  if (!content) return undefined;
+  for (const c of content) {
+    if (c?.type === "image" && typeof c.data === "string") return Buffer.from(c.data, "base64");
+  }
+  return undefined;
+}
+
+/** Join the text parts of a message's content with newlines; strings pass through, anything else yields "". */
+function textOf(content: unknown): string {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const parts: string[] = [];
+  for (const c of content) {
+    if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") {
+      parts.push((c as { text: string }).text);
+    }
+  }
+  return parts.join("\n");
+}
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
new file mode 100644
index 0000000..c711463
--- /dev/null
+++ b/packages/bench/src/types.ts
@@ -0,0 +1,57 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+
+/** Token totals summed across every model call in a run. */
+export interface TokenTotals {
+  input: number;
+  output: number;
+  total: number;
+}
+
+/** A task from the osunlp/Online-Mind2Web dataset. */
+export interface Om2wTask {
+  task_id: string;
+  website: string;
+  confirmed_task: string;
+  reference_length: number;
+}
+
+/** One step of an Online-Mind2Web v2 trajectory. */
+export interface ActionStep {
+  step: number;
+  screenshot: string;
+  action: string;
+  thought: string | null;
+  url: string | null;
+}
+
+/** A result.json conforming to the official `online-mind2web-v2` submission schema. */
+export interface Om2wResult {
+  schema_version: "online-mind2web-v2";
+  task: string;
+  task_id: string;
+  agent_final_answer: string | null;
+  reference_length: number;
+  action_history: ActionStep[];
+}
+
+/** Per-run cost/speed sidecar, kept out of result.json so the latter stays schema-pure. */
+export interface TaskMetrics {
+  task_id: string;
+  model: CuaModelRef;
+  wallClockMs: number;
+  steps: number;
+  tokens: TokenTotals;
+  costUsd: number | null;
+  stopReason: string;
+  errorMessage?: string;
+}
+
+/** Aggregated accuracy/cost/speed for one model — the numbers that fill the page. */
+export interface ModelSummary {
+  model: CuaModelRef;
+  tasks: number;
+  passed: number | null;
+  accuracyPct: number | null;
+  avgCostUsd: number | null;
+  avgSpeedSec: number;
+}
diff --git a/packages/bench/tsconfig.build.json b/packages/bench/tsconfig.build.json
new file mode 100644
index 0000000..7802bbd
--- /dev/null
+++ b/packages/bench/tsconfig.build.json
@@ -0,0 +1,13 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "./dist-tsc",
+    "rootDir": "./src",
+    "emitDeclarationOnly": true,
+    "sourceMap": false,
+    "declarationMap": false
+  },
+  "include": ["src/**/*.ts"],
+  "exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"],
+  "references": [{ "path": "../ai" }, { "path": "../agent" }]
+}
diff --git a/packages/bench/tsconfig.json b/packages/bench/tsconfig.json
new file mode 100644
index 0000000..d8faaf5
--- /dev/null
+++ b/packages/bench/tsconfig.json
@@ -0,0 +1,3 @@
+{
+  "extends": "./tsconfig.build.json"
+}
diff --git a/tsconfig.json b/tsconfig.json
index aad9af1..6166f1e 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -4,6 +4,7 @@
     { "path": "./packages/ai" },
     { "path": "./packages/agent" },
     { "path": "./packages/ptywright" },
-    { "path": "./packages/cli" }
+    { "path": "./packages/cli" },
+    { "path": "./packages/bench" }
   ]
 }