Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"packages/ai",
"packages/agent",
"packages/ptywright",
"packages/cli"
"packages/cli",
"packages/bench"
],
"scripts": {
"build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",
Expand Down
2 changes: 2 additions & 0 deletions packages/bench/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
results/
tasks/
27 changes: 27 additions & 0 deletions packages/bench/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "@onkernel/cua-bench",
"version": "0.0.0",
"description": "Benchmark runner for CUA models on Kernel cloud browsers",
"license": "MIT",
"type": "module",
"private": true,
"exports": {
".": {
"types": "./dist-tsc/index.d.ts",
"source": "./src/index.ts"
}
},
"scripts": {
"bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts",
"aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts",
"typecheck": "tsc -b"
},
"dependencies": {
"@onkernel/cua-agent": "*",
"@onkernel/cua-ai": "*",
"@onkernel/sdk": "0.49.0"
},
"devDependencies": {
"tsx": "^4.21.0"
}
}
41 changes: 41 additions & 0 deletions packages/bench/scripts/fetch-tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""Fetch Online-Mind2Web tasks into the JSON the TS harness reads.

The dataset is gated, so this needs an HF token (HF_TOKEN env, or `huggingface-cli login`).

pip install datasets
HF_TOKEN=hf_... python scripts/fetch-tasks.py --out tasks/online-mind2web-test.json
"""
import argparse
import json
import os
from pathlib import Path

from datasets import load_dataset


def main() -> None:
    """Download one split of Online-Mind2Web and write it out as a JSON task list.

    Command-line flags:
        --out    destination JSON file (default: tasks/online-mind2web-test.json)
        --split  dataset split to load (default: test)

    The Hugging Face token is read from the HF_TOKEN environment variable and
    passed to ``load_dataset``; when the variable is unset, ``None`` is passed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="tasks/online-mind2web-test.json")
    cli.add_argument("--split", default="test")
    options = cli.parse_args()

    hf_token = os.environ.get("HF_TOKEN")
    dataset = load_dataset("osunlp/Online-Mind2Web", split=options.split, token=hf_token)

    tasks = []
    for record in dataset:
        # A missing or null reference_length falls back to 1.
        raw_length = record.get("reference_length")
        tasks.append(
            {
                "task_id": record["task_id"],
                "website": record["website"],
                "confirmed_task": record["confirmed_task"],
                "reference_length": 1 if raw_length is None else int(record["reference_length"]),
            }
        )

    destination = Path(options.out)
    # Create the output directory on demand so a fresh checkout works.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(tasks, indent=2))
    print(f"wrote {len(tasks)} tasks to {destination}")


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions packages/bench/scripts/run-webjudge.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Score benchmark trajectories with the OFFICIAL Online-Mind2Web WebJudge.
#
# Clones the upstream OSU-NLP repo and runs its WebJudge over each model's
# trajectories (which the harness already wrote in the official v2 schema),
# then normalizes the output to <model>/webjudge.jsonl for the aggregator.
#
# Usage:
#   OPENAI_API_KEY=... scripts/run-webjudge.sh results [judge-model] [score-threshold]
#
# Arguments:
#   $1  results directory containing one sub-directory per model (default: results)
#   $2  judge model name passed to the upstream runner            (default: o4-mini)
#   $3  score threshold passed to the upstream runner             (default: 3)
set -euo pipefail

# Resolve to an absolute path: the judge is run from inside the cloned repo,
# so a relative results path would no longer point at the right place.
RESULTS_DIR="$(cd "${1:-results}" && pwd)"
JUDGE_MODEL="${2:-o4-mini}"
THRESHOLD="${3:-3}"
: "${OPENAI_API_KEY:?OPENAI_API_KEY is required for WebJudge}"

# Scratch checkout of the upstream judge; removed on any exit path.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT
git clone --depth 1 https://github.com/OSU-NLP-Group/Online-Mind2Web "$WORKDIR/om2w"
# Install through `python -m pip` so the requirements land in the same
# interpreter that executes run.py below (a bare `pip` may belong to another).
python -m pip install -q -r "$WORKDIR/om2w/requirements.txt"

for MODEL_DIR in "$RESULTS_DIR"/*/; do
  # When the glob matches nothing it stays literal; skip that non-directory.
  [ -d "$MODEL_DIR" ] || continue
  MODEL_DIR="${MODEL_DIR%/}"
  echo "== WebJudge: $MODEL_DIR =="
  # NOTE(review): the API key is passed on the command line because the
  # upstream runner takes it as --api_key; it is therefore visible in the
  # process list while the judge runs.
  ( cd "$WORKDIR/om2w/src" && python run.py \
      --mode WebJudge_Online_Mind2Web_eval \
      --model "$JUDGE_MODEL" \
      --trajectories_dir "$MODEL_DIR" \
      --api_key "$OPENAI_API_KEY" \
      --output_path "$MODEL_DIR" \
      --score_threshold "$THRESHOLD" )
  OUT="$MODEL_DIR/WebJudge_Online_Mind2Web_eval_${JUDGE_MODEL}_score_threshold_${THRESHOLD}_auto_eval_results.json"
  if [ -f "$OUT" ]; then
    cp "$OUT" "$MODEL_DIR/webjudge.jsonl"
  else
    # Surface the gap instead of skipping silently: without this file the
    # aggregator has no fresh judgements for this model.
    echo "warning: WebJudge produced no output at $OUT; webjudge.jsonl not updated for $MODEL_DIR" >&2
  fi
done

echo "WebJudge complete — aggregate with: npm run aggregate"
120 changes: 120 additions & 0 deletions packages/bench/src/aggregate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import { readdir, readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import type { ModelSummary, TaskMetrics } from "./types";

/**
 * Build one ModelSummary per model directory found under `outDir`.
 *
 * Cost and speed are averaged over the per-task `metrics.json` sidecars.
 * Accuracy needs `<model>/webjudge.jsonl` (one `{task_id, predicted_label}`
 * object per line, written by `scripts/run-webjudge.sh`); while that file is
 * absent both `passed` and `accuracyPct` are null.
 *
 * The accuracy denominator is every attempted task: tasks with metrics plus
 * tasks that recorded attempts but never produced metrics, so a task that can
 * never finish counts against the model instead of vanishing from the rate.
 * Model directories without any readable `metrics.json` are skipped.
 *
 * Side effects: writes `<outDir>/summary.json` and prints a table to stdout.
 *
 * @param outDir - Results directory holding one sub-directory per model.
 * @returns The summaries, in directory-listing order.
 */
export async function aggregate(outDir: string): Promise<ModelSummary[]> {
  const collected: ModelSummary[] = [];
  const children = await readdir(outDir, { withFileTypes: true });

  for (const child of children) {
    if (!child.isDirectory()) continue;

    const modelDir = join(outDir, child.name);
    const metrics = await readMetrics(modelDir);
    const firstTask = metrics[0];
    // No finished task at all: nothing to summarise for this directory.
    if (metrics.length === 0 || firstTask === undefined) continue;

    const exhausted = await countExhaustedTasks(modelDir);
    const attempted = metrics.length + exhausted;
    const verdicts = await readJudgements(join(modelDir, "webjudge.jsonl"));

    // Tasks whose cost is null are left out of the cost average entirely.
    const knownCosts = metrics.map((m) => m.costUsd).filter((c): c is number => c !== null);
    const wallClockTotalMs = sum(metrics.map((m) => m.wallClockMs));

    let passed: number | null = null;
    let accuracyPct: number | null = null;
    if (verdicts) {
      const passCount = metrics.filter((m) => verdicts.get(m.task_id) === true).length;
      passed = passCount;
      accuracyPct = round((passCount / attempted) * 100, 1);
    }

    collected.push({
      model: firstTask.model,
      tasks: attempted,
      passed,
      accuracyPct,
      avgCostUsd: knownCosts.length ? round(sum(knownCosts) / knownCosts.length, 4) : null,
      avgSpeedSec: round(wallClockTotalMs / metrics.length / 1000, 1),
    });
  }

  const summaryPath = join(outDir, "summary.json");
  await writeFile(summaryPath, `${JSON.stringify(collected, null, 2)}\n`);
  printTable(collected);
  return collected;
}

/**
 * Collect the parsed `metrics.json` of every task sub-directory in `modelDir`.
 *
 * @param modelDir - Directory holding one sub-directory per task.
 * @returns One parsed object per sub-directory whose `metrics.json` could be
 *   read and parsed; the JSON is not validated against TaskMetrics.
 */
async function readMetrics(modelDir: string): Promise<TaskMetrics[]> {
  const children = await readdir(modelDir, { withFileTypes: true });
  const collected: TaskMetrics[] = [];

  for (const child of children) {
    if (child.isDirectory()) {
      const metricsPath = join(modelDir, child.name, "metrics.json");
      try {
        const text = await readFile(metricsPath, "utf8");
        collected.push(JSON.parse(text));
      } catch {
        // No readable, parsable metrics.json here: treat the task as not yet run.
      }
    }
  }

  return collected;
}

/**
 * Count task sub-directories that have a readable `attempts` file but no
 * readable `metrics.json` — tasks that were tried and never finished.
 *
 * @param modelDir - Directory holding one sub-directory per task.
 * @returns Number of such sub-directories.
 */
async function countExhaustedTasks(modelDir: string): Promise<number> {
  // True when the path can be read as a UTF-8 file; any read error means false.
  const isReadable = async (file: string): Promise<boolean> => {
    try {
      await readFile(file, "utf8");
      return true;
    } catch {
      return false;
    }
  };

  const children = await readdir(modelDir, { withFileTypes: true });
  let exhausted = 0;

  for (const child of children) {
    if (!child.isDirectory()) continue;
    const taskDir = join(modelDir, child.name);

    // A task with metrics finished, whatever its attempt history.
    if (await isReadable(join(taskDir, "metrics.json"))) continue;
    if (await isReadable(join(taskDir, "attempts"))) exhausted += 1;
  }

  return exhausted;
}

/**
 * Load WebJudge verdicts from a JSONL file into a `task_id -> passed` map.
 *
 * Blank lines are ignored. Every other line must be a JSON object with a
 * string `task_id`; its `predicted_label` is normalised by `isPass`. When the
 * same `task_id` appears more than once, the last line wins.
 *
 * @param path - Location of the judgement file.
 * @returns The verdict map, or `undefined` when the file cannot be read.
 * @throws Error prefixed with `<path>:<line>` (1-based) when a line is not
 *   valid JSON or lacks a string `task_id`. Failing loudly matters here: a
 *   dropped verdict would silently count that task as not passed.
 */
async function readJudgements(path: string): Promise<Map<string, boolean> | undefined> {
  let raw: string;
  try {
    raw = await readFile(path, "utf8");
  } catch {
    return undefined;
  }

  const verdicts = new Map<string, boolean>();
  for (const [index, text] of raw.split("\n").entries()) {
    if (!text.trim()) continue;
    const where = `${path}:${index + 1}`;

    let row: unknown;
    try {
      row = JSON.parse(text);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${where}: invalid JSON in judgement line (${reason})`);
    }

    const fields = typeof row === "object" && row !== null ? (row as Record<string, unknown>) : undefined;
    const taskId = fields?.["task_id"];
    if (fields === undefined || typeof taskId !== "string") {
      throw new Error(`${where}: judgement is missing a string task_id`);
    }
    verdicts.set(taskId, isPass(fields["predicted_label"]));
  }
  return verdicts;
}

/**
 * Normalise a judge label to a boolean verdict.
 *
 * Passing values are the number `1`, the boolean `true`, and the strings
 * "1", "success", "yes" and "true" (case-insensitive). Everything else,
 * including other types, is a fail.
 */
function isPass(label: unknown): boolean {
  if (typeof label === "number") return label === 1;
  if (typeof label === "boolean") return label;
  if (typeof label === "string") return ["1", "success", "yes", "true"].includes(label.toLowerCase());
  return false;
}

/**
 * Print the summaries to stdout as a tab-separated table, one model per row.
 * A null accuracy or cost is shown as an em dash.
 */
function printTable(summaries: ModelSummary[]): void {
  console.log("\nmodel\taccuracy\tcost/task\tspeed");

  summaries.forEach((row) => {
    let accuracy = "—";
    if (row.accuracyPct !== null) accuracy = `${row.accuracyPct}%`;

    let cost = "—";
    if (row.avgCostUsd !== null) cost = `$${row.avgCostUsd}`;

    const cells = [`${row.model}`, accuracy, cost, `${row.avgSpeedSec}s`];
    console.log(cells.join("\t"));
  });
}

/** Add up the numbers in `xs`, left to right; an empty array gives 0. */
function sum(xs: number[]): number {
  let total = 0;
  for (const value of xs) {
    total = total + value;
  }
  return total;
}

/**
 * Round `x` to `places` decimal places by scaling with a power of ten,
 * applying `Math.round`, and scaling back.
 */
function round(x: number, places: number): number {
  const scale = 10 ** places;
  const scaled = Math.round(x * scale);
  return scaled / scale;
}

// CLI entry point: runs only when the script path Node was started with ends
// in "aggregate.ts". The first CLI argument, when given, replaces the default
// "results" directory. Any failure is logged and the process exits with 1.
if (process.argv[1]?.endsWith("aggregate.ts") === true) {
  const resultsDir = process.argv[2] ?? "results";
  aggregate(resultsDir).catch((failure) => {
    console.error(failure);
    process.exit(1);
  });
}
Loading
Loading