kernel · jarugupj · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -8,7 +8,8 @@
     "packages/ai",
     "packages/agent",
     "packages/ptywright",
-    "packages/cli"
+    "packages/cli",
+    "packages/bench"
   ],
   "scripts": {
     "build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",

diff --git a/packages/bench/.gitignore b/packages/bench/.gitignore
@@ -0,0 +1,2 @@
+results/
+tasks/
diff --git a/packages/bench/package.json b/packages/bench/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "@onkernel/cua-bench",
+  "version": "0.0.0",
+  "description": "Benchmark runner for CUA models on Kernel cloud browsers",
+  "license": "MIT",
+  "type": "module",
+  "private": true,
+  "exports": {
+    ".": {
+      "types": "./dist-tsc/index.d.ts",
+      "source": "./src/index.ts"
+    }
+  },
+  "scripts": {
+    "bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts",
+    "aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts",
+    "typecheck": "tsc -b"
+  },
+  "dependencies": {
+    "@onkernel/cua-agent": "*",
+    "@onkernel/cua-ai": "*",
+    "@onkernel/sdk": "0.49.0"
+  },
+  "devDependencies": {
+    "tsx": "^4.21.0"
+  }
+}
diff --git a/packages/bench/scripts/fetch-tasks.py b/packages/bench/scripts/fetch-tasks.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+"""Fetch Online-Mind2Web tasks into the JSON the TS harness reads.
+
+The dataset is gated, so this needs an HF token (HF_TOKEN env, or `huggingface-cli login`).
+
+    pip install datasets
+    HF_TOKEN=hf_... python scripts/fetch-tasks.py --out tasks/online-mind2web-test.json
+"""
+import argparse
+import json
+import os
+from pathlib import Path
+
+from datasets import load_dataset
+
+
+def main() -> None:
+    """Download one split of Online-Mind2Web and write it out as a JSON task list.
+
+    Command-line flags:
+        --out    destination JSON file (default ``tasks/online-mind2web-test.json``)
+        --split  dataset split to load (default ``test``)
+
+    The Hugging Face token is read from the ``HF_TOKEN`` environment variable;
+    when it is unset, ``None`` is passed to ``load_dataset``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--out", default="tasks/online-mind2web-test.json")
+    cli.add_argument("--split", default="test")
+    opts = cli.parse_args()
+
+    dataset = load_dataset(
+        "osunlp/Online-Mind2Web",
+        split=opts.split,
+        token=os.environ.get("HF_TOKEN"),
+    )
+
+    # Keep only the four fields written to the output; a missing reference_length becomes 1.
+    records = []
+    for item in dataset:
+        ref_len = item.get("reference_length")
+        records.append(
+            {
+                "task_id": item["task_id"],
+                "website": item["website"],
+                "confirmed_task": item["confirmed_task"],
+                "reference_length": 1 if ref_len is None else int(ref_len),
+            }
+        )
+
+    destination = Path(opts.out)
+    # Create the parent directory on demand so a fresh checkout works.
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(json.dumps(records, indent=2))
+    print(f"wrote {len(records)} tasks to {destination}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/bench/scripts/run-webjudge.sh b/packages/bench/scripts/run-webjudge.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Score benchmark trajectories with the OFFICIAL Online-Mind2Web WebJudge.
+#
+# Clones the upstream OSU-NLP repo and runs its WebJudge over each model's
+# trajectories (which the harness already wrote in the official v2 schema),
+# then normalizes the output to <model>/webjudge.jsonl for the aggregator.
+#
+#   OPENAI_API_KEY=... scripts/run-webjudge.sh results [judge-model] [score-threshold]
+#
+# Arguments (all optional, positional):
+#   $1  results directory containing one sub-directory per model (default: results)
+#   $2  judge model name passed to the upstream runner       (default: o4-mini)
+#   $3  score threshold passed to the upstream runner        (default: 3)
+set -euo pipefail
+
+# Resolve to an absolute path: the judge is launched from inside the clone.
+RESULTS_DIR="$(cd "${1:-results}" && pwd)"
+JUDGE_MODEL="${2:-o4-mini}"
+THRESHOLD="${3:-3}"
+: "${OPENAI_API_KEY:?OPENAI_API_KEY is required for WebJudge}"
+
+# Throwaway checkout of the upstream repo, removed on any exit.
+WORKDIR="$(mktemp -d)"
+trap 'rm -rf "$WORKDIR"' EXIT
+git clone --depth 1 https://github.com/OSU-NLP-Group/Online-Mind2Web "$WORKDIR/om2w"
+pip install -q -r "$WORKDIR/om2w/requirements.txt"
+
+for MODEL_DIR in "$RESULTS_DIR"/*/; do
+    # With no sub-directories the glob stays literal; this guard skips it.
+    [ -d "$MODEL_DIR" ] || continue
+    MODEL_DIR="${MODEL_DIR%/}"
+    echo "== WebJudge: $MODEL_DIR =="
+    # NOTE(review): the API key is passed as a command-line argument here, so it
+    # appears in the process argument list while the judge runs.
+    ( cd "$WORKDIR/om2w/src" && python run.py \
+        --mode WebJudge_Online_Mind2Web_eval \
+        --model "$JUDGE_MODEL" \
+        --trajectories_dir "$MODEL_DIR" \
+        --api_key "$OPENAI_API_KEY" \
+        --output_path "$MODEL_DIR" \
+        --score_threshold "$THRESHOLD" )
+    OUT="$MODEL_DIR/WebJudge_Online_Mind2Web_eval_${JUDGE_MODEL}_score_threshold_${THRESHOLD}_auto_eval_results.json"
+    # Explicit if/else rather than `[ -f ] && cp`: a missing result file is
+    # reported instead of silently leaving this model without webjudge.jsonl.
+    if [ -f "$OUT" ]; then
+        cp "$OUT" "$MODEL_DIR/webjudge.jsonl"
+    else
+        echo "warning: expected WebJudge output not found: $OUT" >&2
+    fi
+done
+
+echo "WebJudge complete — aggregate with: npm run aggregate"
diff --git a/packages/bench/src/aggregate.ts b/packages/bench/src/aggregate.ts
@@ -0,0 +1,120 @@
+import { readdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import type { ModelSummary, TaskMetrics } from "./types";
+
+/**
+ * Build one ModelSummary per model directory under `outDir`, write them to
+ * `<outDir>/summary.json`, print them as a table, and return them.
+ *
+ * Cost and speed are averaged over the tasks that have a `metrics.json`.
+ * Accuracy needs `<model>/webjudge.jsonl` (one `{task_id, predicted_label}`
+ * object per line, as produced by `scripts/run-webjudge.sh`); while that file
+ * is absent, `passed` and `accuracyPct` are null.
+ *
+ * The accuracy denominator is the completed tasks plus the tasks counted by
+ * `countExhaustedTasks`, so a task that never produced metrics still counts
+ * against the model instead of dropping out of the rate. A model directory
+ * with no `metrics.json` at all is skipped entirely.
+ *
+ * @param outDir Results directory holding one sub-directory per model.
+ * @returns The summaries, in directory-listing order.
+ */
+export async function aggregate(outDir: string): Promise<ModelSummary[]> {
+	const children = await readdir(outDir, { withFileTypes: true });
+	const summaries: ModelSummary[] = [];
+
+	for (const child of children) {
+		if (!child.isDirectory()) continue;
+
+		const modelDir = join(outDir, child.name);
+		const completed = await readMetrics(modelDir);
+		if (completed.length === 0) continue;
+
+		const exhausted = await countExhaustedTasks(modelDir);
+		const attempted = completed.length + exhausted;
+		const verdicts = await readJudgements(join(modelDir, "webjudge.jsonl"));
+
+		// Both accuracy fields stay null until a judge file exists.
+		let passed: number | null = null;
+		let accuracyPct: number | null = null;
+		if (verdicts) {
+			let passCount = 0;
+			for (const m of completed) {
+				if (verdicts.get(m.task_id) === true) passCount += 1;
+			}
+			passed = passCount;
+			accuracyPct = round((passCount / attempted) * 100, 1);
+		}
+
+		// Tasks with a null cost are left out of the cost average.
+		const knownCosts = completed.map((m) => m.costUsd).filter((c): c is number => c !== null);
+		const avgCostUsd = knownCosts.length ? round(sum(knownCosts) / knownCosts.length, 4) : null;
+
+		const totalWallClockMs = sum(completed.map((m) => m.wallClockMs));
+		const avgSpeedSec = round(totalWallClockMs / completed.length / 1000, 1);
+
+		summaries.push({
+			model: completed[0]!.model,
+			tasks: attempted,
+			passed,
+			accuracyPct,
+			avgCostUsd,
+			avgSpeedSec,
+		});
+	}
+
+	const summaryJson = JSON.stringify(summaries, null, 2) + "\n";
+	await writeFile(join(outDir, "summary.json"), summaryJson);
+	printTable(summaries);
+	return summaries;
+}
+
+/**
+ * Collect the parsed `metrics.json` of every task sub-directory of `modelDir`.
+ * Sub-directories whose `metrics.json` cannot be read or parsed are skipped.
+ */
+async function readMetrics(modelDir: string): Promise<TaskMetrics[]> {
+	const children = await readdir(modelDir, { withFileTypes: true });
+	const collected: TaskMetrics[] = [];
+	for (const child of children) {
+		if (child.isDirectory()) {
+			const metricsPath = join(modelDir, child.name, "metrics.json");
+			try {
+				const text = await readFile(metricsPath, "utf8");
+				collected.push(JSON.parse(text));
+			} catch {
+				// no readable, parseable metrics.json here — treated as not yet run
+			}
+		}
+	}
+	return collected;
+}
+
+/**
+ * Count the task sub-directories of `modelDir` that have a readable `attempts`
+ * file but no readable `metrics.json` — tasks that were retried yet never
+ * finished, which the caller treats as permanent failures.
+ */
+async function countExhaustedTasks(modelDir: string): Promise<number> {
+	// True when `path` can be read as a UTF-8 file, false on any read error.
+	const readable = async (path: string): Promise<boolean> => {
+		try {
+			await readFile(path, "utf8");
+			return true;
+		} catch {
+			return false;
+		}
+	};
+
+	let exhausted = 0;
+	const children = await readdir(modelDir, { withFileTypes: true });
+	for (const child of children) {
+		if (!child.isDirectory()) continue;
+		const taskDir = join(modelDir, child.name);
+		// A task with metrics completed, so it is never counted here.
+		if (await readable(join(taskDir, "metrics.json"))) continue;
+		if (await readable(join(taskDir, "attempts"))) exhausted += 1;
+	}
+	return exhausted;
+}
+
+/**
+ * Load a WebJudge result file (JSON Lines) into a `task_id -> passed` map.
+ *
+ * Each non-blank line is parsed as one JSON object with `task_id` and
+ * `predicted_label`; the label is converted to a boolean by `isPass`. Lines
+ * that parse to something other than an object are ignored.
+ *
+ * @param path Location of the `webjudge.jsonl` file.
+ * @returns The verdict map, or `undefined` when the file cannot be read.
+ * @throws SyntaxError naming the file and 1-based line number when a line is
+ *   not valid JSON.
+ */
+async function readJudgements(path: string): Promise<Map<string, boolean> | undefined> {
+	let raw: string;
+	try {
+		raw = await readFile(path, "utf8");
+	} catch {
+		// unreadable or missing judge output — signalled to the caller as undefined
+		return undefined;
+	}
+	const map = new Map<string, boolean>();
+	for (const [index, line] of raw.split("\n").entries()) {
+		if (!line.trim()) continue;
+		let row: unknown;
+		try {
+			row = JSON.parse(line);
+		} catch (err) {
+			// Same exception type as a bare JSON.parse, but says where the bad line is.
+			const reason = err instanceof Error ? err.message : String(err);
+			throw new SyntaxError(`${path}:${index + 1}: invalid JSON in WebJudge output: ${reason}`);
+		}
+		// `null` and primitives carry no verdict; skip them rather than crash on property access.
+		if (typeof row !== "object" || row === null) continue;
+		const { task_id, predicted_label } = row as { task_id: string; predicted_label: unknown };
+		map.set(task_id, isPass(predicted_label));
+	}
+	return map;
+}
+
+/**
+ * Interpret a judge label as pass/fail.
+ *
+ * Passing values are the number `1`, the boolean `true`, and the strings
+ * `"1"`, `"success"`, `"yes"`, `"true"` compared case-insensitively. Every
+ * other value, including other types, counts as a failure.
+ */
+function isPass(label: unknown): boolean {
+	switch (typeof label) {
+		case "number":
+			return label === 1;
+		case "boolean":
+			return label;
+		case "string": {
+			const normalized = label.toLowerCase();
+			return normalized === "1" || normalized === "success" || normalized === "yes" || normalized === "true";
+		}
+		default:
+			return false;
+	}
+}
+
+/**
+ * Print the summaries to stdout as tab-separated rows under a header line.
+ * A null accuracy or cost is shown as an em dash.
+ */
+function printTable(summaries: ModelSummary[]): void {
+	const missing = "—";
+	console.log("\nmodel\taccuracy\tcost/task\tspeed");
+	summaries.forEach((row) => {
+		const cells = [
+			`${row.model}`,
+			row.accuracyPct === null ? missing : `${row.accuracyPct}%`,
+			row.avgCostUsd === null ? missing : `$${row.avgCostUsd}`,
+			`${row.avgSpeedSec}s`,
+		];
+		console.log(cells.join("\t"));
+	});
+}
+
+/** Add up the numbers in `xs` from left to right; an empty array gives 0. */
+function sum(xs: number[]): number {
+	let total = 0;
+	for (const value of xs) {
+		total += value;
+	}
+	return total;
+}
+
+/**
+ * Round `x` to `places` decimal places by scaling, applying `Math.round`, and
+ * scaling back. Subject to ordinary binary floating-point representation.
+ */
+function round(x: number, places: number): number {
+	const scale = Math.pow(10, places);
+	const scaled = x * scale;
+	return Math.round(scaled) / scale;
+}
+
+// Run the aggregator when this file is the script being executed; the results
+// directory is the first CLI argument and defaults to "results".
+if ((process.argv[1] ?? "").endsWith("aggregate.ts")) {
+	const resultsDir = process.argv[2] ?? "results";
+	aggregate(resultsDir).catch((error) => {
+		// Report the failure and exit non-zero.
+		console.error(error);
+		process.exit(1);
+	});
+}