From c7c3883a2f1b467d6c2ba8f59659e084147463eb Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:32:12 +0000 Subject: [PATCH 1/5] Add packages/bench with a single-task CUA model runner Introduce a private @onkernel/cua-bench workspace that runs one task on one model against a fresh Kernel browser via CuaAgentHarness, capturing wall-clock, turn count, and token totals. Accuracy scoring and cost conversion are left unscored for follow-up work. Includes a spike entrypoint for a manual run. --- package-lock.json | 20 ++++- package.json | 3 +- packages/bench/package.json | 26 ++++++ packages/bench/src/index.ts | 2 + packages/bench/src/runTask.ts | 128 +++++++++++++++++++++++++++++ packages/bench/src/spike.ts | 21 +++++ packages/bench/src/types.ts | 31 +++++++ packages/bench/tsconfig.build.json | 13 +++ packages/bench/tsconfig.json | 3 + tsconfig.json | 3 +- 10 files changed, 247 insertions(+), 3 deletions(-) create mode 100644 packages/bench/package.json create mode 100644 packages/bench/src/index.ts create mode 100644 packages/bench/src/runTask.ts create mode 100644 packages/bench/src/spike.ts create mode 100644 packages/bench/src/types.ts create mode 100644 packages/bench/tsconfig.build.json create mode 100644 packages/bench/tsconfig.json diff --git a/package-lock.json b/package-lock.json index 3e439a7..5383681 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,8 @@ "packages/ai", "packages/agent", "packages/ptywright", - "packages/cli" + "packages/cli", + "packages/bench" ], "devDependencies": { "@types/node": "22.18.4", @@ -3455,6 +3456,10 @@ "resolved": "packages/ai", "link": true }, + "node_modules/@onkernel/cua-bench": { + "resolved": "packages/bench", + "link": true + }, "node_modules/@onkernel/cua-cli": { "resolved": "packages/cli", "link": true @@ -6100,6 +6105,19 @@ "vitest": "^3.2.4" } }, + "packages/bench": { + "name": "@onkernel/cua-bench", + "version": "0.0.0", + "license": 
"MIT", + "dependencies": { + "@onkernel/cua-agent": "*", + "@onkernel/cua-ai": "*", + "@onkernel/sdk": "0.49.0" + }, + "devDependencies": { + "tsx": "^4.21.0" + } + }, "packages/cli": { "name": "@onkernel/cua-cli", "version": "0.1.4", diff --git a/package.json b/package.json index 1f5397e..7c2266d 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,8 @@ "packages/ai", "packages/agent", "packages/ptywright", - "packages/cli" + "packages/cli", + "packages/bench" ], "scripts": { "build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present", diff --git a/packages/bench/package.json b/packages/bench/package.json new file mode 100644 index 0000000..a67fb8b --- /dev/null +++ b/packages/bench/package.json @@ -0,0 +1,26 @@ +{ + "name": "@onkernel/cua-bench", + "version": "0.0.0", + "description": "Benchmark runner for CUA models on Kernel cloud browsers", + "license": "MIT", + "type": "module", + "private": true, + "exports": { + ".": { + "types": "./dist-tsc/index.d.ts", + "source": "./src/index.ts" + } + }, + "scripts": { + "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts", + "typecheck": "tsc -b" + }, + "dependencies": { + "@onkernel/cua-agent": "*", + "@onkernel/cua-ai": "*", + "@onkernel/sdk": "0.49.0" + }, + "devDependencies": { + "tsx": "^4.21.0" + } +} diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts new file mode 100644 index 0000000..1b7489e --- /dev/null +++ b/packages/bench/src/index.ts @@ -0,0 +1,2 @@ +export { runTask, type RunTaskOptions } from "./runTask"; +export type { Task, TaskResult, TokenTotals } from "./types"; diff --git a/packages/bench/src/runTask.ts b/packages/bench/src/runTask.ts new file mode 100644 index 0000000..32356e2 --- /dev/null +++ b/packages/bench/src/runTask.ts @@ -0,0 +1,128 @@ +import { type AgentHarnessEvent, CuaAgentHarness, 
/** Options accepted by {@link runTask}. */
export interface RunTaskOptions {
  /** Kernel API key. Defaults to `KERNEL_API_KEY`. */
  kernelApiKey?: string;
  /** Kernel browser session lifetime in seconds. Defaults to 300. */
  timeoutSeconds?: number;
  /** Root directory for jsonl transcripts. Defaults to a temp dir. */
  sessionsRoot?: string;
}

/**
 * Run a single benchmark task on a single model against a fresh Kernel
 * browser and return timing, turn and token totals.
 *
 * `success` is always null (accuracy is not scored here). `costUsd` is the sum
 * of the per-message costs reported in assistant usage, or null when no
 * message reported a positive cost.
 *
 * @param modelRef - Model to drive the agent harness with.
 * @param task - Task whose `prompt` is sent to the harness.
 * @param options - Optional API key, browser lifetime and transcript root.
 * @returns The populated {@link TaskResult}.
 * @throws Error when no Kernel API key is supplied or present in the environment.
 */
export async function runTask(
  modelRef: CuaModelRef,
  task: Task,
  options: RunTaskOptions = {},
): Promise<TaskResult> {
  const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY;
  if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task");

  const client = new Kernel({ apiKey: kernelApiKey });
  const browser = await client.browsers.create({
    stealth: true,
    // Missing, zero, negative or NaN lifetimes all fall back to 300 seconds.
    timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300,
  });

  const tokens: TokenTotals = { input: 0, output: 0, total: 0 };
  let costUsd: number | null = null;
  let steps = 0;
  let stopReason = "completed";
  let finalText = "";
  let errorMessage: string | undefined;
  let wallClockMs = 0;
  let unsubscribe: (() => void) | undefined;

  // Everything after the browser exists runs inside this try so that a failure
  // while creating the session or harness still deletes the browser.
  try {
    const cwd = process.cwd();
    const repo = new JsonlSessionRepo({
      fs: new NodeExecutionEnv({ cwd }),
      sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"),
    });
    const session = await repo.create({ cwd });

    const harness = new CuaAgentHarness({
      env: new NodeExecutionEnv({ cwd }),
      session,
      model: modelRef,
      browser,
      client,
      systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt,
      getApiKeyAndHeaders: async (resolved) => {
        const apiKey = getCuaEnvApiKey(resolved.provider);
        return apiKey ? { apiKey } : undefined;
      },
    });

    unsubscribe = harness.subscribe((event: AgentHarnessEvent) => {
      if (event.type === "turn_start") {
        steps += 1;
        return;
      }
      if (event.type === "message_end" && event.message.role === "assistant") {
        const { usage } = event.message;
        tokens.input += usage.input;
        tokens.output += usage.output;
        tokens.total += usage.totalTokens;
        // Stay null until some message actually reports a positive cost.
        if (usage.cost.total > 0) costUsd = (costUsd ?? 0) + usage.cost.total;
      }
    });

    const startedAt = Date.now();
    try {
      const screenshot = await captureScreenshot(client, browser.session_id);
      const images: ImageContent[] | undefined = screenshot
        ? [{ type: "image", data: screenshot, mimeType: "image/png" }]
        : undefined;
      const assistant = await harness.prompt(task.prompt, images ? { images } : undefined);
      stopReason = assistant.stopReason;
      finalText = textOf(assistant.content);
      if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
        errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
      }
    } finally {
      // Stop the clock before teardown so browser deletion is not billed to the model.
      wallClockMs = Date.now() - startedAt;
    }
  } finally {
    unsubscribe?.();
    // Best-effort cleanup: a failed delete must not mask the run's own outcome.
    await client.browsers.deleteByID(browser.session_id).catch(() => {});
  }

  return {
    model: modelRef,
    taskId: task.id,
    success: null,
    stopReason,
    finalText,
    errorMessage,
    wallClockMs,
    steps,
    tokens,
    costUsd,
  };
}

/**
 * Capture the browser's current screen as a base64 string.
 * Returns undefined when the capture call or body read throws.
 */
async function captureScreenshot(client: Kernel, sessionId: string): Promise<string | undefined> {
  try {
    const response = await client.browsers.computer.captureScreenshot(sessionId);
    const arrayBuffer = await response.arrayBuffer();
    return Buffer.from(arrayBuffer).toString("base64");
  } catch {
    return undefined;
  }
}

/**
 * Flatten message content to text: a string is returned as-is, an array
 * contributes the `text` of each `{ type: "text" }` part joined by newlines,
 * and anything else yields the empty string.
 */
function textOf(content: unknown): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  const parts: string[] = [];
  for (const c of content) {
    if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") {
      parts.push((c as { text: string }).text);
    }
  }
  return parts.join("\n");
}
/** A single benchmark task to run against a model. */
export interface Task {
  /** Task identifier; `runTask` copies it into `TaskResult.taskId`. */
  id: string;
  /** Instruction text that `runTask` sends to the agent harness as the prompt. */
  prompt: string;
}

/** Token totals summed across every model call in a run. */
export interface TokenTotals {
  /** Sum of `usage.input` over assistant messages. */
  input: number;
  /** Sum of `usage.output` over assistant messages. */
  output: number;
  /** Sum of `usage.totalTokens` over assistant messages. */
  total: number;
}

/** Outcome of running one task on one model. */
export interface TaskResult {
  /** The model reference the task was run with. */
  model: CuaModelRef;
  /** `Task.id` of the task that was run. */
  taskId: string;
  /** null until an accuracy judge scores the run. */
  success: boolean | null;
  /** Stop reason reported by the final assistant message. */
  stopReason: string;
  /** Text parts of the final assistant message, joined by newlines. */
  finalText: string;
  /** Set when the run stopped with `error` or `aborted`. */
  errorMessage?: string;
  /** Elapsed wall-clock time of the run, in milliseconds. */
  wallClockMs: number;
  /** Number of agent turns taken. */
  steps: number;
  tokens: TokenTotals;
  /** null when the provider doesn't report a cost. */
  costUsd: number | null;
}
00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 15:58:11 +0000 Subject: [PATCH 2/5] Add Online-Mind2Web benchmark harness to packages/bench Build the full standard-benchmark pipeline on top of the single-task runner: load the osunlp/Online-Mind2Web tasks, run them across models on Kernel browsers (stealth, fresh profile, 600s, concurrency-capped, resumable), and emit official online-mind2web-v2 trajectories plus a cost/speed sidecar per task. Accuracy is scored by the official WebJudge (via scripts/run-webjudge.sh) rather than a reimplementation; aggregate.ts rolls results into the per-model accuracy/cost/speed table. fetch-tasks.py loads the gated dataset. --- packages/bench/.gitignore | 2 + packages/bench/package.json | 2 + packages/bench/scripts/fetch-tasks.py | 41 +++++++++ packages/bench/scripts/run-webjudge.sh | 36 ++++++++ packages/bench/src/aggregate.ts | 99 +++++++++++++++++++++ packages/bench/src/benchmark.ts | 99 +++++++++++++++++++++ packages/bench/src/browser.ts | 53 +++++++++++ packages/bench/src/index.ts | 22 ++++- packages/bench/src/pool.ts | 16 ++++ packages/bench/src/runOne.ts | 117 +++++++++++++++++++++++++ packages/bench/src/tasks.ts | 18 ++++ packages/bench/src/trajectory.ts | 100 +++++++++++++++++++++ packages/bench/src/types.ts | 49 +++++++++++ 13 files changed, 653 insertions(+), 1 deletion(-) create mode 100644 packages/bench/.gitignore create mode 100755 packages/bench/scripts/fetch-tasks.py create mode 100755 packages/bench/scripts/run-webjudge.sh create mode 100644 packages/bench/src/aggregate.ts create mode 100644 packages/bench/src/benchmark.ts create mode 100644 packages/bench/src/browser.ts create mode 100644 packages/bench/src/pool.ts create mode 100644 packages/bench/src/runOne.ts create mode 100644 packages/bench/src/tasks.ts create mode 100644 packages/bench/src/trajectory.ts diff --git a/packages/bench/.gitignore b/packages/bench/.gitignore new file mode 100644 index 
def main() -> None:
    """Download one split of Online-Mind2Web and write it as a JSON task list.

    Reads ``--out`` (destination file) and ``--split`` (dataset split) from the
    command line, loads the dataset using the ``HF_TOKEN`` environment variable
    when set, keeps only the four fields the TS harness reads, and writes them
    as an indented JSON array, creating the parent directory if needed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="tasks/online-mind2web-test.json")
    cli.add_argument("--split", default="test")
    opts = cli.parse_args()

    dataset = load_dataset(
        "osunlp/Online-Mind2Web",
        split=opts.split,
        token=os.environ.get("HF_TOKEN"),
    )

    records = []
    for row in dataset:
        record = {
            "task_id": row["task_id"],
            "website": row["website"],
            "confirmed_task": row["confirmed_task"],
        }
        # A missing or null reference length is recorded as 1.
        ref_len = row.get("reference_length")
        record["reference_length"] = 1 if ref_len is None else int(ref_len)
        records.append(record)

    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(records, indent=2))
    print(f"wrote {len(records)} tasks to {destination}")


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
# Score benchmark trajectories with the OFFICIAL Online-Mind2Web WebJudge.
#
# Clones the upstream OSU-NLP repo and runs its WebJudge over each model's
# trajectories (which the harness already wrote in the official v2 schema),
# then copies the judge output to <model-dir>/webjudge.jsonl for the aggregator.
#
#   OPENAI_API_KEY=... scripts/run-webjudge.sh results [judge-model] [score-threshold]
#
# Arguments:
#   $1  results directory holding one sub-directory per model (default: results)
#   $2  judge model name passed to WebJudge (default: o4-mini)
#   $3  score threshold passed to WebJudge (default: 3)
set -euo pipefail

RESULTS_DIR="$(cd "${1:-results}" && pwd)"
JUDGE_MODEL="${2:-o4-mini}"
THRESHOLD="${3:-3}"
: "${OPENAI_API_KEY:?OPENAI_API_KEY is required for WebJudge}"

# Throwaway checkout of the upstream judge; removed on any exit.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT
git clone --depth 1 https://github.com/OSU-NLP-Group/Online-Mind2Web "$WORKDIR/om2w"
pip install -q -r "$WORKDIR/om2w/requirements.txt"

MISSING=0
for MODEL_DIR in "$RESULTS_DIR"/*/; do
  # With no sub-directories the glob stays literal; skip it.
  [ -d "$MODEL_DIR" ] || continue
  MODEL_DIR="${MODEL_DIR%/}"
  echo "== WebJudge: $MODEL_DIR =="
  ( cd "$WORKDIR/om2w/src" && python run.py \
      --mode WebJudge_Online_Mind2Web_eval \
      --model "$JUDGE_MODEL" \
      --trajectories_dir "$MODEL_DIR" \
      --api_key "$OPENAI_API_KEY" \
      --output_path "$MODEL_DIR" \
      --score_threshold "$THRESHOLD" )
  OUT="$MODEL_DIR/WebJudge_Online_Mind2Web_eval_${JUDGE_MODEL}_score_threshold_${THRESHOLD}_auto_eval_results.json"
  if [ -f "$OUT" ]; then
    cp "$OUT" "$MODEL_DIR/webjudge.jsonl"
  else
    # Say so explicitly: otherwise the aggregator would quietly report no
    # accuracy, or reuse a webjudge.jsonl left over from an earlier run.
    echo "warning: WebJudge produced no output at $OUT; $MODEL_DIR/webjudge.jsonl was not updated" >&2
    MISSING=$((MISSING + 1))
  fi
done

if [ "$MISSING" -gt 0 ]; then
  echo "WebJudge finished with $MISSING model dir(s) missing judge output" >&2
fi
echo "WebJudge complete — aggregate with: npm run aggregate"
Cost/speed come from + * the `metrics.json` sidecars; accuracy comes from an optional WebJudge output + * (`/webjudge.jsonl`, one `{task_id, predicted_label}` per line) written + * by `scripts/run-webjudge.sh`. Accuracy is null until that file exists. + */ +export async function aggregate(outDir: string): Promise { + const summaries: ModelSummary[] = []; + const modelDirs = await readdir(outDir, { withFileTypes: true }); + + for (const entry of modelDirs) { + if (!entry.isDirectory()) continue; + const modelDir = join(outDir, entry.name); + const metrics = await readMetrics(modelDir); + if (metrics.length === 0) continue; + + const judged = await readJudgements(join(modelDir, "webjudge.jsonl")); + const costs = metrics.map((m) => m.costUsd).filter((c): c is number => c !== null); + const passed = judged ? metrics.filter((m) => judged.get(m.task_id) === true).length : null; + + summaries.push({ + model: metrics[0]!.model, + tasks: metrics.length, + passed, + accuracyPct: judged ? round((passed! / judged.size) * 100, 1) : null, + avgCostUsd: costs.length ? 
round(sum(costs) / costs.length, 4) : null, + avgSpeedSec: round(sum(metrics.map((m) => m.wallClockMs)) / metrics.length / 1000, 1), + }); + } + + await writeFile(join(outDir, "summary.json"), `${JSON.stringify(summaries, null, 2)}\n`); + printTable(summaries); + return summaries; +} + +async function readMetrics(modelDir: string): Promise { + const out: TaskMetrics[] = []; + for (const entry of await readdir(modelDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + try { + out.push(JSON.parse(await readFile(join(modelDir, entry.name, "metrics.json"), "utf8"))); + } catch { + // task dir without a finished metrics.json — not yet run + } + } + return out; +} + +async function readJudgements(path: string): Promise | undefined> { + let raw: string; + try { + raw = await readFile(path, "utf8"); + } catch { + return undefined; + } + const map = new Map(); + for (const line of raw.split("\n")) { + if (!line.trim()) continue; + const row = JSON.parse(line) as { task_id: string; predicted_label: unknown }; + map.set(row.task_id, isPass(row.predicted_label)); + } + return map; +} + +function isPass(label: unknown): boolean { + if (typeof label === "number") return label === 1; + if (typeof label === "boolean") return label; + if (typeof label === "string") return ["1", "success", "yes", "true"].includes(label.toLowerCase()); + return false; +} + +function printTable(summaries: ModelSummary[]): void { + console.log("\nmodel\taccuracy\tcost/task\tspeed"); + for (const s of summaries) { + const acc = s.accuracyPct === null ? "—" : `${s.accuracyPct}%`; + const cost = s.avgCostUsd === null ? 
/** Models benchmarked when `--models` is not given. */
const DEFAULT_MODELS: CuaModelRef[] = [
  "anthropic:claude-opus-4-6",
  "openai:gpt-5.5",
  "google:gemini-3-flash-preview",
];

/** Parsed command-line options for a benchmark run. */
interface Options {
  /** Path of the task JSON file (`--tasks`). */
  tasksPath: string;
  /** Results root; artifacts go to `<outDir>/<model-slug>/<task_id>/` (`--out`). */
  outDir: string;
  /** Optional cap on the number of tasks loaded (`--limit`). */
  limit?: number;
  /** Maximum number of tasks in flight per model (`--concurrency`). */
  concurrency: number;
  /** Models to run, in order (`--models`, comma-separated). */
  models: CuaModelRef[];
}

/**
 * Parse a whole-number flag value, rejecting anything that is not an integer
 * of at least `min` (so `NaN`, fractions and negatives fail loudly).
 */
function parseCount(flag: string, raw: string, min: number): number {
  const parsed = Number(raw);
  if (!Number.isInteger(parsed) || parsed < min) {
    throw new Error(`${flag} expects an integer >= ${min}, got "${raw}"`);
  }
  return parsed;
}

/**
 * Parse benchmark CLI flags into {@link Options}, starting from the defaults.
 *
 * @throws Error on an unknown argument, a flag with no value, a non-integer
 *   `--limit`/`--concurrency`, or a `--models` list with no names. Failing here
 *   avoids silently running zero tasks after a typo.
 */
function parseArgs(argv: string[]): Options {
  const opts: Options = {
    tasksPath: "tasks/online-mind2web-test.json",
    outDir: "results",
    concurrency: 5,
    models: DEFAULT_MODELS,
  };
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    // Consume the token after the current flag as its value.
    const value = (): string => {
      const next = argv[++i];
      if (next === undefined || next === "" || next.startsWith("--")) {
        throw new Error(`missing value for ${flag}`);
      }
      return next;
    };
    switch (flag) {
      case "--tasks":
        opts.tasksPath = value();
        break;
      case "--out":
        opts.outDir = value();
        break;
      case "--limit":
        opts.limit = parseCount(flag, value(), 0);
        break;
      case "--concurrency":
        opts.concurrency = parseCount(flag, value(), 1);
        break;
      case "--models": {
        const names = value()
          .split(",")
          .map((s) => s.trim())
          .filter((s) => s.length > 0);
        if (names.length === 0) throw new Error("--models expects at least one model name");
        opts.models = names as CuaModelRef[];
        break;
      }
      default:
        throw new Error(`unknown argument: ${flag}`);
    }
  }
  return opts;
}

/** Resolve to true when `path` is accessible, false otherwise. */
async function exists(path: string): Promise<boolean> {
  try {
    await access(path);
    return true;
  } catch {
    return false;
  }
}

/**
 * Run every loaded task on every requested model, one model at a time, with
 * up to `concurrency` tasks in flight. A task whose `result.json` already
 * exists is skipped, which makes an interrupted run resumable. A task that
 * throws is logged and counted as failed without stopping the run.
 */
async function main(): Promise<void> {
  const opts = parseArgs(process.argv.slice(2));
  const client = createKernelClient();
  const tasks = await loadTasks(opts.tasksPath, opts.limit);
  console.log(`[bench] ${tasks.length} tasks × ${opts.models.length} models, concurrency ${opts.concurrency}`);

  for (const model of opts.models) {
    const slug = modelSlug(model);
    console.log(`[bench] === ${model} ===`);
    let done = 0;
    let failed = 0;
    let skipped = 0;
    await runPool(tasks, opts.concurrency, async (task) => {
      const taskDir = join(opts.outDir, slug, task.task_id);
      if (await exists(join(taskDir, "result.json"))) {
        skipped++;
        return;
      }
      try {
        const m = await runOne(client, model, task, DEFAULT_BROWSER_SETTINGS, taskDir);
        done++;
        console.log(`[bench] ${slug} ${task.task_id} ok steps=${m.steps} ${(m.wallClockMs / 1000).toFixed(1)}s`);
      } catch (err) {
        failed++;
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`[bench] ${slug} ${task.task_id} FAILED: ${reason}`);
      }
    });
    console.log(`[bench] ${slug}: done=${done} skipped=${skipped} failed=${failed}`);
  }

  console.log(`[bench] complete — results in ${opts.outDir}/`);
  console.log("[bench] next: score with scripts/run-webjudge.sh, then aggregate with src/aggregate.ts");
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
/** Browser configuration held constant across every model so the only variable is the model. */
export interface BrowserSettings {
  stealth: boolean;
  viewport: { width: number; height: number };
  timeoutSeconds: number;
}

/** Benchmark defaults: stealth on, 1280x800 viewport, 600-second session lifetime. */
export const DEFAULT_BROWSER_SETTINGS: BrowserSettings = {
  stealth: true,
  viewport: { width: 1280, height: 800 },
  timeoutSeconds: 600,
};

/** A provisioned browser, the client that created it, and its teardown hook. */
export interface BrowserHandle {
  client: Kernel;
  browser: KernelBrowser;
  close(): Promise<void>;
}

/**
 * Build a Kernel client from an explicit key or, failing that, `KERNEL_API_KEY`.
 *
 * @throws Error when neither source provides a non-empty key.
 */
export function createKernelClient(apiKey?: string): Kernel {
  const resolvedKey = apiKey ?? process.env.KERNEL_API_KEY;
  if (resolvedKey) return new Kernel({ apiKey: resolvedKey });
  throw new Error("KERNEL_API_KEY is required");
}

/** Provision a fresh Kernel browser under the given settings. */
export async function provisionBrowser(client: Kernel, settings: BrowserSettings): Promise<BrowserHandle> {
  const { stealth, viewport, timeoutSeconds } = settings;
  const browser = await client.browsers.create({ stealth, viewport, timeout_seconds: timeoutSeconds });

  // Deletion is best-effort: a failed delete is swallowed so close() never rejects.
  const close = async (): Promise<void> => {
    await client.browsers.deleteByID(browser.session_id).catch(() => {});
  };

  return { client, browser, close };
}

/** Capture the browser's screen as raw bytes, or undefined if the capture throws. */
export async function captureScreenshot(client: Kernel, sessionId: string): Promise<Buffer | undefined> {
  try {
    const response = await client.browsers.computer.captureScreenshot(sessionId);
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  } catch {
    return undefined;
  }
}
/**
 * Run `worker` over `items` with at most `concurrency` in flight at once.
 *
 * `concurrency` is floored to a whole number and clamped to at least 1; `NaN`
 * is treated as 1. Without that, a `NaN` would create zero lanes and the pool
 * would resolve without processing anything.
 *
 * @param items - Work items; each is passed to `worker` exactly once.
 * @param concurrency - Maximum number of simultaneous `worker` calls.
 * @param worker - Async handler receiving the item and its index in `items`.
 * @returns Resolves once every item is processed; rejects with the first worker rejection.
 */
export async function runPool<T>(
  items: T[],
  concurrency: number,
  worker: (item: T, index: number) => Promise<unknown>,
): Promise<void> {
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
  const laneCount = Math.min(Math.max(1, requested), items.length);

  // Shared cursor: each lane claims the next unprocessed index until none remain.
  let next = 0;
  const lane = async (): Promise<void> => {
    for (let index = next++; index < items.length; index = next++) {
      await worker(items[index] as T, index);
    }
  };

  await Promise.all(Array.from({ length: laneCount }, lane));
}
/** Filesystem-safe slug for a provider-qualified model ref like `anthropic:claude-opus-4-6`. */
export function modelSlug(model: CuaModelRef): string {
  // Every character outside [a-zA-Z0-9._-] becomes "-".
  return model.replace(/[^a-zA-Z0-9._-]/g, "-");
}

/**
 * Build the agent prompt for a task. The website is prepended as
 * "Go to <website> and ..." unless it is empty or the task text already
 * mentions it (case-insensitively).
 */
function buildPrompt(task: Om2wTask): string {
  if (task.website && !task.confirmed_task.toLowerCase().includes(task.website.toLowerCase())) {
    return `Go to ${task.website} and ${task.confirmed_task}`;
  }
  return task.confirmed_task;
}

/** What one agent run produced, before it is written to disk. */
interface AgentOutcome {
  recording: TrajectoryRecording;
  stopReason: string;
  errorMessage: string | undefined;
  wallClockMs: number;
}

/**
 * Run one Online-Mind2Web task on one model against a fresh Kernel browser and
 * write the official v2 trajectory (`result.json` + `trajectory/`) plus a
 * `metrics.json` cost/speed sidecar into `taskDir`.
 *
 * The browser is closed even when session or harness setup throws. If the
 * agent run itself throws, nothing is written and the error propagates.
 *
 * @returns The metrics that were written to `metrics.json`.
 */
export async function runOne(
  client: Kernel,
  model: CuaModelRef,
  task: Om2wTask,
  settings: BrowserSettings,
  taskDir: string,
): Promise<TaskMetrics> {
  const outcome = await runInFreshBrowser(client, model, task, settings);
  const metrics: TaskMetrics = {
    task_id: task.task_id,
    model,
    wallClockMs: outcome.wallClockMs,
    steps: outcome.recording.turns,
    tokens: outcome.recording.tokens,
    costUsd: outcome.recording.costUsd,
    stopReason: outcome.stopReason,
    errorMessage: outcome.errorMessage,
  };
  await writeArtifacts(taskDir, task, outcome.recording, metrics);
  return metrics;
}

/** Provision a browser, run the agent in it, and always close the browser afterwards. */
async function runInFreshBrowser(
  client: Kernel,
  model: CuaModelRef,
  task: Om2wTask,
  settings: BrowserSettings,
): Promise<AgentOutcome> {
  const handle = await provisionBrowser(client, settings);
  try {
    return await driveAgent(client, handle, model, task);
  } finally {
    await handle.close();
  }
}

/** Create a session and harness on an existing browser, prompt the agent once, and record the trajectory. */
async function driveAgent(
  client: Kernel,
  handle: Awaited<ReturnType<typeof provisionBrowser>>,
  model: CuaModelRef,
  task: Om2wTask,
): Promise<AgentOutcome> {
  const cwd = process.cwd();
  const repo = new JsonlSessionRepo({
    fs: new NodeExecutionEnv({ cwd }),
    sessionsRoot: join(tmpdir(), "cua-bench", "sessions"),
  });
  const session = await repo.create({ cwd });

  const harness = new CuaAgentHarness({
    env: new NodeExecutionEnv({ cwd }),
    session,
    model,
    browser: handle.browser,
    client,
    systemPrompt: ({ model: active }) => resolveCuaRuntimeSpec(active).defaultSystemPrompt,
    getApiKeyAndHeaders: async (resolved) => {
      const apiKey = getCuaEnvApiKey(resolved.provider);
      return apiKey ? { apiKey } : undefined;
    },
  });

  const { recording, stop } = recordTrajectory(harness);
  const startedAt = Date.now();
  let stopReason = "completed";
  let errorMessage: string | undefined;
  try {
    const shot = await captureScreenshot(handle.client, handle.browser.session_id);
    const images: ImageContent[] | undefined = shot
      ? [{ type: "image", data: shot.toString("base64"), mimeType: "image/png" }]
      : undefined;
    const assistant = await harness.prompt(buildPrompt(task), images ? { images } : undefined);
    stopReason = assistant.stopReason;
    if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
      errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
    }
  } finally {
    stop();
  }
  // Measured here, before the caller closes the browser, so teardown is excluded.
  return { recording, stopReason, errorMessage, wallClockMs: Date.now() - startedAt };
}

/**
 * Write the screenshots, `metrics.json` and `result.json` for one task.
 *
 * `result.json` is written last because the benchmark runner treats its
 * presence as "task finished" when resuming. Writing it first could leave a
 * task marked done with no `metrics.json` for the aggregator.
 */
async function writeArtifacts(
  taskDir: string,
  task: Om2wTask,
  recording: TrajectoryRecording,
  metrics: TaskMetrics,
): Promise<void> {
  const trajectoryDir = join(taskDir, "trajectory");
  await mkdir(trajectoryDir, { recursive: true });

  const action_history: ActionStep[] = [];
  for (const [i, step] of recording.steps.entries()) {
    // Zero-padded step index, e.g. 0000.png, 0001.png, ...
    const screenshot = `${String(i).padStart(4, "0")}.png`;
    await writeFile(join(trajectoryDir, screenshot), step.screenshot);
    action_history.push({ step: i, screenshot, action: step.action, thought: step.thought, url: null });
  }

  const result: Om2wResult = {
    schema_version: "online-mind2web-v2",
    task: task.confirmed_task,
    task_id: task.task_id,
    agent_final_answer: recording.finalAnswer,
    reference_length: task.reference_length,
    action_history,
  };
  await writeFile(join(taskDir, "metrics.json"), `${JSON.stringify(metrics, null, 2)}\n`);
  await writeFile(join(taskDir, "result.json"), `${JSON.stringify(result, null, 2)}\n`);
}
/**
 * Load Online-Mind2Web tasks from a local JSON file produced by
 * `scripts/fetch-tasks.py` (the dataset is gated, so it's fetched with the
 * official `datasets` loader rather than over HTTP).
 *
 * @param path - Path of the JSON file; it must contain a JSON array.
 * @param limit - Optional cap: only the first `limit` tasks are returned.
 * @returns The parsed tasks. Elements are not validated individually.
 * @throws RangeError when `limit` is given but is not a non-negative integer
 *   (a `NaN` limit would otherwise silently yield zero tasks).
 * @throws Error when the file does not exist or does not hold a JSON array;
 *   other read errors and JSON syntax errors are rethrown unchanged.
 */
export async function loadTasks(path: string, limit?: number): Promise<Om2wTask[]> {
  if (limit !== undefined && (!Number.isInteger(limit) || limit < 0)) {
    throw new RangeError(`limit must be a non-negative integer, got ${limit}`);
  }

  let raw: string;
  try {
    raw = await readFile(path, "utf8");
  } catch (err) {
    // Only a missing file gets the "generate it" hint; a permission or I/O
    // error is surfaced as-is so it is not mistaken for a missing file.
    if (typeof err === "object" && err !== null && (err as { code?: unknown }).code === "ENOENT") {
      throw new Error(`task file not found at ${path} — generate it with: python scripts/fetch-tasks.py --out ${path}`);
    }
    throw err;
  }

  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`task file at ${path} must contain a JSON array of tasks`);
  }
  const tasks = parsed as Om2wTask[];
  return typeof limit === "number" ? tasks.slice(0, limit) : tasks;
}
+ */ +export function recordTrajectory(harness: CuaAgentHarness): { + recording: TrajectoryRecording; + stop: () => void; +} { + const recording: TrajectoryRecording = { + steps: [], + finalAnswer: null, + tokens: { input: 0, output: 0, total: 0 }, + costUsd: null, + turns: 0, + }; + const pendingActions = new Map(); + let currentThought: string | null = null; + + const stop = harness.subscribe((event: AgentHarnessEvent) => { + switch (event.type) { + case "turn_start": + recording.turns += 1; + return; + case "message_end": { + if (event.message.role !== "assistant") return; + const text = textOf(event.message.content); + if (text) { + currentThought = text; + recording.finalAnswer = text; + } + const { usage } = event.message; + recording.tokens.input += usage.input; + recording.tokens.output += usage.output; + recording.tokens.total += usage.totalTokens; + if (usage.cost.total > 0) recording.costUsd = (recording.costUsd ?? 0) + usage.cost.total; + return; + } + case "tool_execution_start": + pendingActions.set(event.toolCallId, formatAction(event.toolName, event.args)); + return; + case "tool_execution_end": { + const action = pendingActions.get(event.toolCallId) ?? event.toolName; + pendingActions.delete(event.toolCallId); + const screenshot = screenshotOf(event.result); + if (screenshot) recording.steps.push({ action, thought: currentThought, screenshot }); + return; + } + default: + return; + } + }); + + return { recording, stop }; +} + +function formatAction(toolName: string, args: unknown): string { + const rendered = args && typeof args === "object" ? JSON.stringify(args) : String(args ?? ""); + return rendered ? 
`${toolName} ${rendered}` : toolName; +} + +function screenshotOf(result: unknown): Buffer | undefined { + const content = (result as { content?: Array<{ type?: string; data?: string }> } | undefined)?.content; + if (!content) return undefined; + for (const c of content) { + if (c?.type === "image" && typeof c.data === "string") return Buffer.from(c.data, "base64"); + } + return undefined; +} + +function textOf(content: unknown): string { + if (typeof content === "string") return content; + if (!Array.isArray(content)) return ""; + const parts: string[] = []; + for (const c of content) { + if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") { + parts.push((c as { text: string }).text); + } + } + return parts.join("\n"); +} diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts index 55c1a59..e7b6070 100644 --- a/packages/bench/src/types.ts +++ b/packages/bench/src/types.ts @@ -29,3 +29,52 @@ export interface TaskResult { /** null when the provider doesn't report a cost. */ costUsd: number | null; } + +/** A task from the osunlp/Online-Mind2Web dataset. */ +export interface Om2wTask { + task_id: string; + website: string; + confirmed_task: string; + reference_length: number; +} + +/** One step of an Online-Mind2Web v2 trajectory. */ +export interface ActionStep { + step: number; + screenshot: string; + action: string; + thought: string | null; + url: string | null; +} + +/** A result.json conforming to the official `online-mind2web-v2` submission schema. */ +export interface Om2wResult { + schema_version: "online-mind2web-v2"; + task: string; + task_id: string; + agent_final_answer: string | null; + reference_length: number; + action_history: ActionStep[]; +} + +/** Per-run cost/speed sidecar, kept out of result.json so the latter stays schema-pure. 
*/ +export interface TaskMetrics { + task_id: string; + model: CuaModelRef; + wallClockMs: number; + steps: number; + tokens: TokenTotals; + costUsd: number | null; + stopReason: string; + errorMessage?: string; +} + +/** Aggregated accuracy/cost/speed for one model — the numbers that fill the page. */ +export interface ModelSummary { + model: CuaModelRef; + tasks: number; + passed: number | null; + accuracyPct: number | null; + avgCostUsd: number | null; + avgSpeedSec: number; +} From aa031046905d508fc17323554c556fcda5a4ee78 Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:32:20 +0000 Subject: [PATCH 3/5] Deslop: drop the superseded single-task spike runOne + the benchmark CLI fully cover the spike's single-task path, so remove runTask.ts/spike.ts and their now-dead Task/TaskResult types. Eliminates the duplicated screenshot helper, textOf, browser provisioning, and usage accumulation that the spike carried alongside the harness. 
--- packages/bench/package.json | 1 - packages/bench/src/index.ts | 3 - packages/bench/src/runTask.ts | 128 ---------------------------------- packages/bench/src/spike.ts | 21 ------ packages/bench/src/types.ts | 23 ------ 5 files changed, 176 deletions(-) delete mode 100644 packages/bench/src/runTask.ts delete mode 100644 packages/bench/src/spike.ts diff --git a/packages/bench/package.json b/packages/bench/package.json index 0bcc229..bda0753 100644 --- a/packages/bench/package.json +++ b/packages/bench/package.json @@ -12,7 +12,6 @@ } }, "scripts": { - "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts", "bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts", "aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts", "typecheck": "tsc -b" diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index da7f27d..e214a07 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -1,4 +1,3 @@ -export { runTask, type RunTaskOptions } from "./runTask"; export { runOne, modelSlug } from "./runOne"; export { loadTasks } from "./tasks"; export { aggregate } from "./aggregate"; @@ -15,8 +14,6 @@ export type { ModelSummary, Om2wResult, Om2wTask, - Task, TaskMetrics, - TaskResult, TokenTotals, } from "./types"; diff --git a/packages/bench/src/runTask.ts b/packages/bench/src/runTask.ts deleted file mode 100644 index 32356e2..0000000 --- a/packages/bench/src/runTask.ts +++ /dev/null @@ -1,128 +0,0 @@ -import { type AgentHarnessEvent, CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent"; -import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai"; -import Kernel from "@onkernel/sdk"; -import { tmpdir } from "node:os"; -import { join } from "node:path"; -import type { Task, TaskResult, TokenTotals } from "./types"; - -export interface RunTaskOptions { - /** Kernel API key. Defaults to `KERNEL_API_KEY`. 
*/ - kernelApiKey?: string; - /** Kernel browser session lifetime in seconds. Defaults to 300. */ - timeoutSeconds?: number; - /** Root directory for jsonl transcripts. Defaults to a temp dir. */ - sessionsRoot?: string; -} - -/** - * Run a single benchmark task on a single model against a fresh Kernel - * browser. Returns timing and token totals; `success` and `costUsd` are - * not scored here. - */ -export async function runTask( - modelRef: CuaModelRef, - task: Task, - options: RunTaskOptions = {}, -): Promise { - const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY; - if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task"); - - const client = new Kernel({ apiKey: kernelApiKey }); - const browser = await client.browsers.create({ - stealth: true, - timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300, - }); - - const cwd = process.cwd(); - const repo = new JsonlSessionRepo({ - fs: new NodeExecutionEnv({ cwd }), - sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"), - }); - const session = await repo.create({ cwd }); - - const tokens: TokenTotals = { input: 0, output: 0, total: 0 }; - let costUsd: number | null = null; - let steps = 0; - - const harness = new CuaAgentHarness({ - env: new NodeExecutionEnv({ cwd }), - session, - model: modelRef, - browser, - client, - systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt, - getApiKeyAndHeaders: async (resolved) => { - const apiKey = getCuaEnvApiKey(resolved.provider); - return apiKey ? 
{ apiKey } : undefined; - }, - }); - - const unsubscribe = harness.subscribe((event: AgentHarnessEvent) => { - if (event.type === "turn_start") { - steps += 1; - return; - } - if (event.type === "message_end" && event.message.role === "assistant") { - const { usage } = event.message; - tokens.input += usage.input; - tokens.output += usage.output; - tokens.total += usage.totalTokens; - if (usage.cost.total > 0) costUsd = (costUsd ?? 0) + usage.cost.total; - } - }); - - const startedAt = Date.now(); - let stopReason = "completed"; - let finalText = ""; - let errorMessage: string | undefined; - try { - const screenshot = await captureScreenshot(client, browser.session_id); - const images: ImageContent[] | undefined = screenshot - ? [{ type: "image", data: screenshot, mimeType: "image/png" }] - : undefined; - const assistant = await harness.prompt(task.prompt, images ? { images } : undefined); - stopReason = assistant.stopReason; - finalText = textOf(assistant.content); - if (assistant.stopReason === "error" || assistant.stopReason === "aborted") { - errorMessage = assistant.errorMessage ?? 
`agent stopped with ${assistant.stopReason}`; - } - } finally { - unsubscribe(); - await client.browsers.deleteByID(browser.session_id).catch(() => {}); - } - - return { - model: modelRef, - taskId: task.id, - success: null, - stopReason, - finalText, - errorMessage, - wallClockMs: Date.now() - startedAt, - steps, - tokens, - costUsd, - }; -} - -async function captureScreenshot(client: Kernel, sessionId: string): Promise { - try { - const response = await client.browsers.computer.captureScreenshot(sessionId); - const arrayBuffer = await response.arrayBuffer(); - return Buffer.from(arrayBuffer).toString("base64"); - } catch { - return undefined; - } -} - -function textOf(content: unknown): string { - if (typeof content === "string") return content; - if (!Array.isArray(content)) return ""; - const parts: string[] = []; - for (const c of content) { - if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") { - parts.push((c as { text: string }).text); - } - } - return parts.join("\n"); -} diff --git a/packages/bench/src/spike.ts b/packages/bench/src/spike.ts deleted file mode 100644 index efd28dd..0000000 --- a/packages/bench/src/spike.ts +++ /dev/null @@ -1,21 +0,0 @@ -import type { CuaModelRef } from "@onkernel/cua-ai"; -import { runTask } from "./runTask"; -import type { Task } from "./types"; - -const TASK: Task = { - id: "hn-top-story", - prompt: "Go to https://news.ycombinator.com and tell me the title of the current top story.", -}; - -const MODEL: CuaModelRef = "anthropic:claude-opus-4-6"; - -async function main(): Promise { - console.log(`[bench] running task "${TASK.id}" on ${MODEL}`); - const result = await runTask(MODEL, TASK); - console.log(JSON.stringify(result, null, 2)); -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts index e7b6070..c711463 100644 --- a/packages/bench/src/types.ts 
+++ b/packages/bench/src/types.ts @@ -1,11 +1,5 @@ import type { CuaModelRef } from "@onkernel/cua-ai"; -/** A single benchmark task to run against a model. */ -export interface Task { - id: string; - prompt: string; -} - /** Token totals summed across every model call in a run. */ export interface TokenTotals { input: number; @@ -13,23 +7,6 @@ export interface TokenTotals { total: number; } -/** Outcome of running one task on one model. */ -export interface TaskResult { - model: CuaModelRef; - taskId: string; - /** null until an accuracy judge scores the run. */ - success: boolean | null; - stopReason: string; - finalText: string; - errorMessage?: string; - wallClockMs: number; - /** Number of agent turns taken. */ - steps: number; - tokens: TokenTotals; - /** null when the provider doesn't report a cost. */ - costUsd: number | null; -} - /** A task from the osunlp/Online-Mind2Web dataset. */ export interface Om2wTask { task_id: string; From 7934b6d93edf394a8f92f439403366c7351c61eb Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 21:08:52 +0000 Subject: [PATCH 4/5] Don't persist result.json for failed bench runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Errored/aborted runs were writing a result.json, which the resumable logic treats as a completed task and permanently skips on retry — baking empty trajectories into the WebJudge scored set. Throw on error/aborted so the run stays retryable and is recorded as failed. 
Co-Authored-By: Claude Opus 4.7 --- packages/bench/src/runOne.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/bench/src/runOne.ts b/packages/bench/src/runOne.ts index 8a58480..7631805 100644 --- a/packages/bench/src/runOne.ts +++ b/packages/bench/src/runOne.ts @@ -72,6 +72,13 @@ export async function runOne( await handle.close(); } + if (stopReason === "error" || stopReason === "aborted") { + // Throw instead of persisting: result.json is the resume sentinel, so writing + // one for a failed run would bake an empty trajectory into the scored set and + // permanently skip the retry. + throw new Error(errorMessage ?? `agent stopped with ${stopReason}`); + } + const wallClockMs = Date.now() - startedAt; const metrics: TaskMetrics = { task_id: task.task_id, From 0cddee033b8a2ea38c9fd049263741f3aa993367 Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 22:48:49 +0000 Subject: [PATCH 5/5] Cap benchmark retries and count permanent failures in accuracy Deterministic errors (e.g. a trajectory that grows past the provider's max request size) would otherwise be retried forever by the resumable runner. Cap attempts per task and record them on disk, then count tasks that exhaust their retries without a trajectory toward the accuracy denominator so a task a model can never finish reads as a failure rather than dropping out of the rate. 
Co-Authored-By: Claude Opus 4.7 --- packages/bench/src/aggregate.ts | 25 ++++++++++++++++++++++-- packages/bench/src/benchmark.ts | 34 ++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/packages/bench/src/aggregate.ts b/packages/bench/src/aggregate.ts index 1dcae79..2398107 100644 --- a/packages/bench/src/aggregate.ts +++ b/packages/bench/src/aggregate.ts @@ -7,6 +7,11 @@ import type { ModelSummary, TaskMetrics } from "./types"; * the `metrics.json` sidecars; accuracy comes from an optional WebJudge output * (`/webjudge.jsonl`, one `{task_id, predicted_label}` per line) written * by `scripts/run-webjudge.sh`. Accuracy is null until that file exists. + * + * The accuracy denominator is every task the model attempted — completed runs + * plus tasks that exhausted their retries without producing a trajectory — so a + * task the model can never finish counts as a failure rather than silently + * dropping out of the rate. */ export async function aggregate(outDir: string): Promise { const summaries: ModelSummary[] = []; @@ -18,15 +23,17 @@ export async function aggregate(outDir: string): Promise { const metrics = await readMetrics(modelDir); if (metrics.length === 0) continue; + const failed = await countExhaustedTasks(modelDir); + const attempted = metrics.length + failed; const judged = await readJudgements(join(modelDir, "webjudge.jsonl")); const costs = metrics.map((m) => m.costUsd).filter((c): c is number => c !== null); const passed = judged ? metrics.filter((m) => judged.get(m.task_id) === true).length : null; summaries.push({ model: metrics[0]!.model, - tasks: metrics.length, + tasks: attempted, passed, - accuracyPct: judged ? round((passed! / judged.size) * 100, 1) : null, + accuracyPct: judged ? round((passed! / attempted) * 100, 1) : null, avgCostUsd: costs.length ? 
round(sum(costs) / costs.length, 4) : null, avgSpeedSec: round(sum(metrics.map((m) => m.wallClockMs)) / metrics.length / 1000, 1), }); @@ -50,6 +57,20 @@ async function readMetrics(modelDir: string): Promise { return out; } +/** Count tasks that recorded retry attempts but never produced a trajectory — permanent failures. */ +async function countExhaustedTasks(modelDir: string): Promise { + let count = 0; + for (const entry of await readdir(modelDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + const taskDir = join(modelDir, entry.name); + const hasMetrics = await readFile(join(taskDir, "metrics.json"), "utf8").then(() => true).catch(() => false); + if (hasMetrics) continue; + const hasAttempts = await readFile(join(taskDir, "attempts"), "utf8").then(() => true).catch(() => false); + if (hasAttempts) count++; + } + return count; +} + async function readJudgements(path: string): Promise | undefined> { let raw: string; try { diff --git a/packages/bench/src/benchmark.ts b/packages/bench/src/benchmark.ts index 3d5b054..b0ec7fd 100644 --- a/packages/bench/src/benchmark.ts +++ b/packages/bench/src/benchmark.ts @@ -1,5 +1,5 @@ import type { CuaModelRef } from "@onkernel/cua-ai"; -import { access } from "node:fs/promises"; +import { access, mkdir, readFile, writeFile } from "node:fs/promises"; import { join } from "node:path"; import { createKernelClient, DEFAULT_BROWSER_SETTINGS } from "./browser"; import { runPool } from "./pool"; @@ -12,6 +12,14 @@ const DEFAULT_MODELS: CuaModelRef[] = [ "google:gemini-3-flash-preview", ]; +/** + * How many times a task may error before it's treated as a permanent failure + * for that model and stops being retried. Some tasks fail deterministically + * (e.g. a model whose trajectory grows past the provider's max request size), + * so without a cap a resumable run would retry them forever. 
+ */ +const MAX_ATTEMPTS = 3; + interface Options { tasksPath: string; outDir: string; @@ -59,6 +67,19 @@ async function exists(path: string): Promise { } } +async function readAttempts(taskDir: string): Promise { + try { + return Number.parseInt(await readFile(join(taskDir, "attempts"), "utf8"), 10) || 0; + } catch { + return 0; + } +} + +async function recordAttempt(taskDir: string, count: number): Promise { + await mkdir(taskDir, { recursive: true }); + await writeFile(join(taskDir, "attempts"), String(count)); +} + async function main(): Promise { const opts = parseArgs(process.argv.slice(2)); const client = createKernelClient(); @@ -71,22 +92,29 @@ async function main(): Promise { let done = 0; let failed = 0; let skipped = 0; + let exhausted = 0; await runPool(tasks, opts.concurrency, async (task) => { const taskDir = join(opts.outDir, slug, task.task_id); if (await exists(join(taskDir, "result.json"))) { skipped++; return; } + const attempts = await readAttempts(taskDir); + if (attempts >= MAX_ATTEMPTS) { + exhausted++; + return; + } try { const m = await runOne(client, model, task, DEFAULT_BROWSER_SETTINGS, taskDir); done++; console.log(`[bench] ${slug} ${task.task_id} ok steps=${m.steps} ${(m.wallClockMs / 1000).toFixed(1)}s`); } catch (err) { + await recordAttempt(taskDir, attempts + 1); failed++; - console.error(`[bench] ${slug} ${task.task_id} FAILED: ${(err as Error).message}`); + console.error(`[bench] ${slug} ${task.task_id} FAILED (attempt ${attempts + 1}/${MAX_ATTEMPTS}): ${(err as Error).message}`); } }); - console.log(`[bench] ${slug}: done=${done} skipped=${skipped} failed=${failed}`); + console.log(`[bench] ${slug}: done=${done} skipped=${skipped} failed=${failed} exhausted=${exhausted}`); } console.log(`[bench] complete — results in ${opts.outDir}/`);