From 4ae9ab8768ab35b3b0a6cfb806972d14ca48da98 Mon Sep 17 00:00:00 2001 From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 06:06:58 +0000 Subject: [PATCH 1/6] skills: add webwright.skills memory/skill-library module + skill_use tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A built-in submodule turning solved tasks into reusable, executable code skills: - skills/{library,retrieve,decide,gate,update,llm}: store / retrieve (relevance) / decide (use·adapt·skip utility) / admission gate (gold|self_verify|none) / evolve (incremental growth on existing library) — backend-agnostic via configure_llm over webwright's own Model abstraction (no hardcoded gateway/key/path) - tools/skill_use.py: solve-time tool (agent invokes like self_reflection/image_qa) -> retrieve+decide -> JSON recommendation (use/adapt/skip + source path) - python -m webwright.skills.update --manifest batch.json --library ./lib : batch growth - tests/skills: 5 unit tests pass against the migrated module (logic == original) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/webwright/skills/__init__.py | 21 +++ src/webwright/skills/decide.py | 50 +++++++ src/webwright/skills/gate.py | 62 ++++++++ src/webwright/skills/library.py | 59 ++++++++ src/webwright/skills/llm.py | 57 ++++++++ src/webwright/skills/retrieve.py | 75 ++++++++++ src/webwright/skills/update.py | 208 +++++++++++++++++++++++++++ src/webwright/tools/skill_use.py | 72 ++++++++++ tests/skills/test_evolve.py | 49 +++++++ tests/skills/test_gate.py | 35 +++++ tests/skills/test_library.py | 33 +++++ tests/skills/test_retrieve_decide.py | 40 ++++++ tests/skills/test_update.py | 36 +++++ 13 files changed, 797 insertions(+) create mode 100644 src/webwright/skills/__init__.py create mode 100644 src/webwright/skills/decide.py create mode 100644 src/webwright/skills/gate.py create mode 100644 src/webwright/skills/library.py create mode 100644 src/webwright/skills/llm.py 
create mode 100644 src/webwright/skills/retrieve.py create mode 100644 src/webwright/skills/update.py create mode 100644 src/webwright/tools/skill_use.py create mode 100644 tests/skills/test_evolve.py create mode 100644 tests/skills/test_gate.py create mode 100644 tests/skills/test_library.py create mode 100644 tests/skills/test_retrieve_decide.py create mode 100644 tests/skills/test_update.py diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py new file mode 100644 index 0000000..6f3383f --- /dev/null +++ b/src/webwright/skills/__init__.py @@ -0,0 +1,21 @@ +"""webwright.skills — a memory/skill library module for webwright. + +Store solved tasks as reusable, executable code skills; retrieve + judge (use/adapt/skip) at +solve time; admit via a gate; and grow the library incrementally (evolve). Plugs into webwright +as a built-in submodule: + - solve-time reuse : the `skill_use` tool (agent invokes it like self_reflection / image_qa) + - offline growth : `update.evolve` (run after solves to distill gate-passed solves into skills) + +Backend-agnostic: configure_llm(model) wires it to any webwright Model. +""" +from .library import Library, Skill +from .retrieve import retrieve, Candidate +from .decide import decide, Decision +from .gate import gate, GateResult +from .update import evolve, Trace # NOTE: don't import the `update` function here — it would +from .llm import configure_llm # shadow the `update` submodule. Use update.evolve / update.update. 
@dataclass
class Decision:
    """Utility judgement for one task: whether (and which) library skill to reuse."""

    verdict: str  # "use" | "adapt" | "skip"
    skill_id: str | None  # skill the verdict refers to; always None when verdict is "skip"
    reason: str  # free-text justification for the verdict
@dataclass
class GateResult:
    """Verdict of the admission gate."""

    admit: bool  # True when the result may enter the library
    reason: str  # human-readable explanation of the verdict


def _shape_ok(result, output_schema) -> bool:
    """Return True when *result* matches the top-level ``type`` of *output_schema*.

    Only the top-level JSON-Schema type is checked; ``items`` / ``properties`` are not
    validated. A missing schema, a missing ``type`` or an unrecognised type means
    "no constraint" and yields True.
    """
    if not output_schema:
        return True
    t = output_schema.get("type")
    if t == "array":
        return isinstance(result, list)
    if t == "object":
        return isinstance(result, dict)
    if t == "string":
        return isinstance(result, str)
    if t == "boolean":
        return isinstance(result, bool)
    if t in ("number", "integer"):
        # bool is a subclass of int in Python but is not a JSON number.
        if isinstance(result, bool) or not isinstance(result, (int, float)):
            return False
        if t == "integer":
            # JSON Schema treats 3.0 as an integer, 3.5 as a non-integer number.
            return isinstance(result, int) or result.is_integer()
        return True
    return True


def _self_verify(result, output_schema) -> GateResult:
    """Gold-free invariant check: non-null, non-empty, and shaped like *output_schema*.

    This checks presence and shape only, not correctness: a wrong but non-empty answer
    of the right type is still admitted.
    """
    if result is None:
        return GateResult(False, "result is null")
    if isinstance(result, (list, dict, str)) and len(result) == 0:
        return GateResult(False, "result is empty")
    if not _shape_ok(result, output_schema):
        return GateResult(False, f"shape != output_schema ({output_schema.get('type')})")
    return GateResult(True, "self-verify passed (non-empty, shape ok)")


def _gold(result, gold) -> GateResult:
    """Admit only when *result* equals the reference answer *gold*."""
    if result == gold:
        return GateResult(True, "matches gold")
    return GateResult(False, "differs from gold")


def gate(result, *, gold=None, output_schema=None, method: str = "auto") -> GateResult:
    """Decide whether a solve's *result* may be admitted into the skill library.

    Args:
        result: the answer produced by the solve.
        gold: reference answer, if one exists. ``None`` means "no gold available".
        output_schema: optional JSON-Schema-like dict; only its top-level ``type`` is used.
        method: ``"gold"`` compares against *gold*; ``"self_verify"`` runs the invariant
            check; ``"none"`` admits everything; ``"auto"`` uses gold when it is given and
            self-verify otherwise. Any other value falls back to self-verify.

    Returns:
        A GateResult carrying the admit flag and the reason.
    """
    if method == "none":
        return GateResult(True, "no gate (admit all)")
    if method == "gold":
        if gold is None:
            # Without this guard a null result would equal the missing gold and be
            # admitted as "matches gold".
            return GateResult(False, "gold gate requested but no gold provided")
        return _gold(result, gold)
    if method == "auto" and gold is not None:
        return _gold(result, gold)
    return _self_verify(result, output_schema)
@dataclass
class Skill:
    """One library entry: the source of skill.py plus the contents of meta.json."""

    skill_id: str
    code: str  # source of skill.py
    meta: dict = field(default_factory=dict)  # {template, site, signature, summary, ...}

    @property
    def summary(self) -> str:
        """Short description stored under meta["summary"] ("" when absent)."""
        return self.meta.get("summary", "")

    @property
    def signature(self) -> dict:
        """Call signature stored under meta["signature"] ({} when absent)."""
        return self.meta.get("signature", {})


class Library:
    """Directory-backed skill store: ``<root>/<skill_id>/{skill.py, meta.json}``.

    Files are always read and written as UTF-8 so that non-ASCII metadata (meta.json is
    dumped with ``ensure_ascii=False``) round-trips regardless of the platform locale.
    """

    def __init__(self, root: str | Path):
        """Open the library at *root*, creating the directory if it does not exist."""
        self.root = Path(root)
        self.root.mkdir(parents=True, exist_ok=True)

    def _dir(self, skill_id: str) -> Path:
        """Return the directory of *skill_id*.

        Raises:
            ValueError: if *skill_id* is not a plain single path component. Ids can come
                from LLM output, so anything that could escape the library root is rejected.
        """
        if (
            not isinstance(skill_id, str)
            or not skill_id
            or skill_id in (".", "..")
            or any(ch in skill_id for ch in ("/", "\\", "\x00"))
        ):
            raise ValueError(f"invalid skill_id: {skill_id!r}")
        return self.root / skill_id

    def list(self) -> list[Skill]:
        """Return every stored skill, ordered by directory name."""
        skills = []
        for entry in sorted(self.root.iterdir()):
            skill = self.get(entry.name)  # None for entries without a meta.json
            if skill is not None:
                skills.append(skill)
        return skills

    def get(self, skill_id: str) -> Skill | None:
        """Load one skill; return None when the id is invalid or has no meta.json."""
        try:
            d = self._dir(skill_id)
        except ValueError:
            return None
        meta_path = d / "meta.json"
        if not meta_path.exists():
            return None
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
        code_path = d / "skill.py"
        code = code_path.read_text(encoding="utf-8") if code_path.exists() else ""
        return Skill(skill_id=skill_id, code=code, meta=meta)

    def add(self, skill: Skill) -> None:
        """Write (or overwrite) skill.py and meta.json for *skill*.

        Raises:
            ValueError: if ``skill.skill_id`` is not a valid single path component.
        """
        d = self._dir(skill.skill_id)
        d.mkdir(parents=True, exist_ok=True)
        (d / "skill.py").write_text(skill.code, encoding="utf-8")
        (d / "meta.json").write_text(
            json.dumps(skill.meta, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    def path(self, skill_id: str) -> Path:
        """Return the path of the skill's source file (it need not exist yet)."""
        return self._dir(skill_id) / "skill.py"
def _model() -> Any:
    """Return the model the skills module should call.

    The model registered through configure_llm() wins. When none has been registered, an
    "openai" model is built via get_model on each call (nothing is cached here).
    """
    if _DEFAULT_MODEL is None:
        # default: openai model from env (OPENAI_API_KEY / OPENAI_BASE_URL respected by the model class)
        return get_model({"model_class": "openai"})
    return _DEFAULT_MODEL
`model` overrides the configured default.""" + m = model if model is not None else _model() + messages = [ + m.format_message(role="system", content=system), + m.format_message(role="user", content=user), + ] + return m(messages) + + +def llm_json(system: str, user: str, **kw: Any) -> dict: + """Call + parse the first {...} JSON object out of the reply.""" + txt = llm(system, user, **kw) + match = re.search(r"\{.*\}", txt, re.S) + if not match: + return {} + try: + return json.loads(match.group(0)) + except Exception: + s = match.group(0) + for end in range(len(s), 0, -1): + try: + return json.loads(s[:end]) + except Exception: + continue + return {} diff --git a/src/webwright/skills/retrieve.py b/src/webwright/skills/retrieve.py new file mode 100644 index 0000000..fae5d1e --- /dev/null +++ b/src/webwright/skills/retrieve.py @@ -0,0 +1,75 @@ +"""取:任务 → 最相关的候选技能(relevance)。 + +接口稳定(实现可换): + retrieve(task, library, *, k=3, method="llm") -> [Candidate] +MVP: 单次 LLM 调用,把整库当 flat catalog 列进 prompt 让它选。库大了换 embedding,接口不变。 +""" +from __future__ import annotations +from dataclasses import dataclass + +from .library import Library, Skill +from .llm import llm_json + + +@dataclass +class Candidate: + skill: Skill + score: float # relevance 0..1 + reason: str + + +def _catalog(library: Library) -> str: + lines = [] + for s in library.list(): + lines.append( + f"- skill_id: {s.skill_id}\n" + f" template: {s.meta.get('template','')}\n" + f" site: {s.meta.get('site','')}\n" + f" summary: {s.summary}\n" + f" params: {s.signature.get('params', [])}" + ) + return "\n".join(lines) + + +def _retrieve_llm(task: str, library: Library, k: int) -> list[Candidate]: + cat = _catalog(library) + if not cat: + return [] + sys = ( + "You match a web task to the most RELEVANT skills in a catalog (relevance only — not yet " + "whether to use them). Return STRICT JSON: " + '{"candidates":[{"skill_id":"...","score":<0..1>,"reason":"..."}]}, most relevant first, ' + f"at most {k}. 
score = how relevant. If nothing is relevant, return an empty list." + ) + user = f"## Task\n{task}\n\n## Skill catalog\n{cat}\n\nReturn at most {k} candidates." + out = llm_json(sys, user) + cands = [] + for c in (out.get("candidates") or [])[:k]: + sk = library.get(c.get("skill_id", "")) + if sk: + try: + score = float(c.get("score", 0)) + except Exception: + score = 0.0 + cands.append(Candidate(skill=sk, score=score, reason=c.get("reason", ""))) + return cands + + +def _retrieve_simple(task: str, library: Library, k: int) -> list[Candidate]: + """No-LLM fallback: rank by keyword overlap between task and template/summary.""" + toks = set(task.lower().split()) + scored = [] + for s in library.list(): + bag = (s.meta.get("template", "") + " " + s.summary).lower().split() + overlap = len(toks & set(bag)) + if overlap: + scored.append(Candidate(skill=s, score=overlap / (len(toks) or 1), reason="keyword overlap")) + scored.sort(key=lambda c: c.score, reverse=True) + return scored[:k] + + +_RETRIEVERS = {"llm": _retrieve_llm, "simple": _retrieve_simple} + + +def retrieve(task: str, library: Library, *, k: int = 3, method: str = "llm") -> list[Candidate]: + return _RETRIEVERS[method](task, library, k) diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py new file mode 100644 index 0000000..bcb4a80 --- /dev/null +++ b/src/webwright/skills/update.py @@ -0,0 +1,208 @@ +"""沉淀(间歇):把通过 gate 的解蒸馏、写回 library,让库从使用中长大。 + +接口稳定(实现可换): + update(traces, library, *, method="grow") -> [被加/更新的 skill_id] + +- method="grow" : 库里还没覆盖这个 template 的,就把这条成功解原样提升为技能(最小形态)。 +- method="refine" : 批量提炼——对齐 N 个 gate 过的解 → 参数化(泛化) + 拆出可复用 primitive + 薄任务层 + → 一个更好的库技能。这是"update 加泛化性 + primitive 复用性"的实现(单次 LLM 批量调用)。 +""" +from __future__ import annotations +import json +import re +from dataclasses import dataclass, field + +from .library import Library, Skill +from .llm import llm + + +@dataclass +class Trace: + template: str + code: str # 这条任务的 final_script(已过 gate = 正确) + 
answer: object = None + meta: dict = field(default_factory=dict) # params / site / start_url / output_schema ... + # usage:这条任务是怎么用库的(驱动 update 的信号) + used_skill_id: str | None = None + verdict: str | None = None # use | adapt | skip + correct: bool = True + + +def _slug(template: str) -> str: + s = re.sub(r"[^a-z0-9]+", "_", template.lower()).strip("_") + return s[:48] or "skill" + + +def _extract_code(txt: str) -> str: + m = re.search(r"```(?:python)?[ \t]*\n", txt) + if m: + end = txt.rfind("```") + if end > m.end(): + return txt[m.end():end] + return txt + + +_REFINE_SYS = ( + "You are given N working Python solutions that EACH solve one concrete instance of the SAME web-task " + "template (they already passed a correctness gate). Distill them into ONE better library skill.\n" + "Do TWO things:\n" + "1) GENERALIZE: align the N solutions; the parts that are IDENTICAL across them are the reusable " + "skeleton; the parts that DIFFER are parameters. Expose the differing values as function " + "arguments / taskspec params — do NOT hardcode any instance's specific values. Make extraction " + "robust (paginate/until-done, self-verify against any declared total).\n" + "2) DECOMPOSE INTO REUSABLE PRIMITIVES: factor the expensive, reusable core into clearly-named " + "primitive functions (e.g. login(), open_report(period), extract_rows()), and keep a THIN task " + "layer on top that calls them. This lets future tasks reuse the primitives even if the final step " + "differs.\n" + "Interface (fixed): the skill reads taskspec.json from sys.argv[1] " + "(taskspec = {params, start_url, credentials, output_schema}) and writes agent_response.json with " + "retrieved_data MATCHING output_schema exactly. Output ONLY the python code in one ```python block." +) + +_REFINE_INCREMENTAL = ( + "\n\nINCREMENTAL MODE: a CURRENT library skill for this template already exists (shown below). " + "Do NOT rewrite it from scratch. 
START from the current skill and IMPROVE it using the NEW solutions: " + "keep its working primitives and structure, only widen/fix what the new solutions reveal (handle a " + "param value it missed, make an extraction more robust, fix a bug). Preserve everything that already " + "works. Output the full improved skill in one ```python block." +) + + +def _refine(traces: list[Trace], library: Library) -> list[str]: + """批量提炼:对齐 N 个 gate 过的解 → 参数化 + primitive。 + 增量:若库里已有同 template 技能,则在【现有技能基础上】改进/加宽(而非从原始解重写)。""" + if not traces: + return [] + template = traces[0].template + schema = traces[0].meta.get("output_schema") + sid = _slug(template) + existing = library.get(sid) # 已有技能?→ 增量演化 + + blocks = [f"## Template\n{template}\n\n## Required output_schema for retrieved_data\n{json.dumps(schema)}\n"] + if existing and existing.code: + blocks.append(f"## CURRENT library skill (improve THIS, do not rewrite)\n```python\n{existing.code}\n```") + label = "NEW solutions" if existing else "Solutions" + for i, tr in enumerate(traces): + blocks.append( + f"## {label} {i} (params={json.dumps(tr.meta.get('params'), ensure_ascii=False)}, " + f"answer={json.dumps(tr.answer, ensure_ascii=False)[:120]})\n```python\n{tr.code}\n```" + ) + sys_prompt = _REFINE_SYS + (_REFINE_INCREMENTAL if existing else "") + code = _extract_code(llm(sys_prompt, "\n\n".join(blocks), max_tokens=16000, timeout=400)) + n_prev = (existing.meta.get("n_solves", 0) if existing else 0) + meta = { + "template": template, + "provenance": "update-refined-incremental" if existing else "update-refined", + "site": traces[0].meta.get("site", ""), + "summary": f"Refined from {n_prev + len(traces)} gate-passed solves; parameterized + primitives.", + "signature": {"params": list((traces[0].meta.get("params") or {}).keys()), + "call": "python skill.py taskspec.json"}, + "output_schema": schema, + "n_solves": n_prev + len(traces), + "revisions": (existing.meta.get("revisions", 1) + 1) if existing else 1, + } + 
def evolve(traces: list[Trace], library: Library) -> dict:
    """Grow the EXISTING library from a batch of traces, driven by how each trace used it.

    Traces are grouped by template and each group is handled as follows:

    - template not yet in the library: the group is refined into a NEW skill ("added").
    - template exists and at least one trace has verdict "adapt": the group is refined
      back into that skill ("adapt_refined").
    - template exists and no trace adapted: the library is left untouched and the existing
      skill id is recorded ("use").

    Only traces with ``correct=True`` are used. Traces without a template are skipped, as
    in ``_grow``; otherwise they would be distilled into a skill with the fallback id
    "skill".

    Returns:
        A changelog dict with the lists "use", "adapt_refined" and "added", plus the
        counters "dropped_wrong" and "skipped_no_template".
    """
    good = [t for t in traces if t.correct]
    usable = [t for t in good if t.template]
    changelog = {
        "use": [],
        "adapt_refined": [],
        "added": [],
        "dropped_wrong": len(traces) - len(good),
        "skipped_no_template": len(good) - len(usable),
    }
    existing_templates = {s.meta.get("template"): s.skill_id for s in library.list()}

    # Group by template so solves of the same family are distilled together.
    by_tmpl: dict[str, list[Trace]] = {}
    for t in usable:
        by_tmpl.setdefault(t.template, []).append(t)

    for tmpl, group in by_tmpl.items():
        if tmpl not in existing_templates:
            # Not covered yet: distill this group into a new skill.
            sid = _refine(group, library)[0]
            changelog["added"].append(sid)
        elif any(t.verdict == "adapt" for t in group):
            # A fix happened: _refine derives the id from the template, so a skill
            # stored under that id is improved rather than duplicated.
            sid = _refine(group, library)[0]
            changelog["adapt_refined"].append(sid)
        else:
            # No adaptation was needed: leave the library untouched.
            changelog["use"].append(existing_templates[tmpl])
    return changelog
def traces_from_manifest(manifest: dict) -> list["Trace"]:
    """Build one Trace per run listed in *manifest*.

    manifest = {"template": str, "runs": [{"dir", "admit", "params", "answer"?, "verdict"?}, ...]}

    For each run the code is read from ``<dir>/final_script.py`` ("" when the file is
    missing). When the run has no "answer", ``retrieved_data`` from
    ``<dir>/agent_response.json`` is used if that file can be read and holds a JSON object.
    ``correct`` is the run's gate verdict ("admit", default True); "verdict" defaults to
    "skip".

    Raises:
        KeyError: if a run entry has no "dir".
    """
    from pathlib import Path

    template = manifest.get("template", "")
    out = []
    for r in manifest.get("runs", []):
        d = Path(r["dir"])
        fs = d / "final_script.py"
        # NOTE(review): a run without final_script.py still becomes a Trace with empty
        # code; confirm whether such runs should count as gate-passed solves.
        code = fs.read_text(encoding="utf-8") if fs.exists() else ""
        answer = r.get("answer")
        response_path = d / "agent_response.json"
        if answer is None and response_path.exists():
            # Best effort: an unreadable or malformed response just leaves answer as None.
            try:
                with open(response_path, encoding="utf-8") as f:
                    payload = json.load(f)
            except (OSError, ValueError):
                payload = None
            if isinstance(payload, dict):
                answer = payload.get("retrieved_data")
        out.append(Trace(template=template, code=code, answer=answer,
                         correct=bool(r.get("admit", True)),
                         verdict=r.get("verdict", "skip"),
                         used_skill_id=r.get("used_skill_id"),
                         meta={"params": r.get("params", {}), "site": r.get("site", ""),
                               "start_url": r.get("start_url", ""),
                               "output_schema": r.get("output_schema")}))
    return out
def recommend(task: str, library_root: str) -> dict:
    """Retrieve candidate skills for *task*, judge their utility, and build the recommendation.

    Returns a dict that always has "verdict", "skill_id" and "reason". For a "use" or
    "adapt" verdict it also carries "summary", "call", "source_path" and "how_to_reuse".
    A "use"/"adapt" verdict that names no skill, or a skill that is not in the library, is
    downgraded to "skip" so the agent is never pointed at a skill it cannot read.
    """
    lib = Library(library_root)
    cands = retrieve(task, lib)
    if not cands:
        return {"verdict": "skip", "skill_id": None, "reason": "library has no relevant skill"}

    d = decide(task, cands)
    if d.verdict == "skip":
        return {"verdict": d.verdict, "skill_id": d.skill_id, "reason": d.reason}

    sk = lib.get(d.skill_id) if d.skill_id else None
    if not sk:
        return {
            "verdict": "skip",
            "skill_id": None,
            "reason": f"decider chose {d.verdict!r} but skill {d.skill_id!r} is not in the library",
        }

    return {
        "verdict": d.verdict,
        "skill_id": d.skill_id,
        "reason": d.reason,
        "summary": sk.summary,
        "call": sk.signature.get("call", ""),
        "source_path": str(lib.path(sk.skill_id)),
        "how_to_reuse": (
            "USE: copy the source into your final_script and fill THIS task's params; "
            "ADAPT: reuse its login/navigation/extraction core, change ONLY the final step."
        ),
    }
def run():
    """Exercise evolve() across add / use / adapt / wrong-solve rounds with _refine stubbed.

    The stub is installed on the update module for the duration of the test only and the
    real _refine is restored afterwards, so other tests in the same process are unaffected.
    """
    # stub _refine: deterministically "build/widen" a skill for the group's template
    def fake_refine(group, library):
        from webwright.skills.update import _slug
        sid = _slug(group[0].template)
        library.add(Skill(sid, f"# refined from {len(group)} solves\n",
                          {"template": group[0].template, "provenance": "test-refine"}))
        return [sid]

    real_refine = U._refine
    U._refine = fake_refine
    try:
        with tempfile.TemporaryDirectory() as d:
            lib = Library(d)

            # round 1: template T1 not in lib, skip verdict -> ADD
            t1 = [U.Trace("T1", "code", verdict="skip", correct=True),
                  U.Trace("T1", "code", verdict="skip", correct=True)]
            log1 = U.evolve(t1, lib)
            assert log1["added"], f"new template should be added: {log1}"
            assert len(lib.list()) == 1

            # round 2: T1 now exists, all USE success -> library unchanged
            t2 = [U.Trace("T1", "code", used_skill_id="t1", verdict="use", correct=True)]
            log2 = U.evolve(t2, lib)
            assert log2["use"] and not log2["added"] and not log2["adapt_refined"], log2
            assert len(lib.list()) == 1, "pure use must not change library"

            # round 3: T1 exists, an ADAPT happened -> refine back (widen)
            t3 = [U.Trace("T1", "code2", used_skill_id="t1", verdict="adapt", correct=True)]
            log3 = U.evolve(t3, lib)
            assert log3["adapt_refined"], f"adapt should refine back: {log3}"

            # wrong solves are dropped (not fed to refine)
            t4 = [U.Trace("T2", "bad", verdict="skip", correct=False)]
            log4 = U.evolve(t4, lib)
            assert log4["dropped_wrong"] == 1 and not log4["added"], log4
    finally:
        U._refine = real_refine

    print("test_evolve OK")
def run():
    """Check the admission gate's self_verify, gold and auto modes on fixed inputs."""
    sv = "self_verify"

    # self_verify: null / empty values are rejected, a non-empty value is admitted
    for empty in (None, [], ""):
        assert gate(empty, method=sv).admit is False
    assert gate(["Sprite"], method=sv).admit is True

    # self_verify: the result's shape has to agree with output_schema
    list_verdict = gate(["a", "b"], output_schema=ARR, method=sv)
    assert list_verdict.admit is True
    dict_verdict = gate({"x": 1}, output_schema=ARR, method=sv)
    assert dict_verdict.admit is False, "dict != array schema"

    # gold: admitted exactly when result equals gold
    reference = ["Sprite"]
    assert gate(["Sprite"], gold=reference, method="gold").admit is True
    assert gate(["Pepsi"], gold=reference, method="gold").admit is False

    # auto: with gold -> gold comparison; without gold -> self_verify
    assert gate(["Sprite"], gold=reference).admit is True
    assert gate(["Pepsi"], gold=reference).admit is False
    assert gate(["anything"]).admit is True
    assert gate(None).admit is False

    print("test_gate OK")
def _lib(d):
    """Create a Library rooted at *d* holding the two fixture skills used by this test."""
    fixtures = (
        ("bestsellers", {"template": "Get the top best-selling product in period",
                         "summary": "magento bestsellers report"}),
        ("reviews", {"template": "Get reviewers who mention something",
                     "summary": "product page reviews"}),
    )
    store = Library(d)
    for skill_id, meta in fixtures:
        store.add(Skill(skill_id, "x", meta))
    return store
[]) + assert isinstance(d0, Decision) and d0.verdict == "skip" and d0.skill_id is None + + print("test_retrieve_decide OK") + + +if __name__ == "__main__": + run() diff --git a/tests/skills/test_update.py b/tests/skills/test_update.py new file mode 100644 index 0000000..2abfffe --- /dev/null +++ b/tests/skills/test_update.py @@ -0,0 +1,36 @@ +"""Unit test: update 'grow' (deterministic, no LLM).""" +import sys, tempfile +from pathlib import Path +pass +from webwright.skills.library import Library +from webwright.skills.update import update, Trace + + +def run(): + with tempfile.TemporaryDirectory() as d: + lib = Library(d) + assert lib.list() == [] + + traces = [ + Trace(template="Get the top-{n} best-selling {entity} in {period}", code="print(1)", + answer=["Sprite"], meta={"site": "shopping_admin"}), + Trace(template="Get the reviewers who mention {x}", code="print(2)", answer=["Bob"]), + ] + added = update(traces, lib, method="grow") + assert len(added) == 2, "two new templates -> two skills added" + assert len(lib.list()) == 2, "library grew to 2" + + # idempotent: same templates again -> nothing new + added2 = update(traces, lib, method="grow") + assert added2 == [], "already-covered templates not re-added" + assert len(lib.list()) == 2, "library unchanged" + + # the distilled skill carries its code + provenance + s = lib.get(added[0]) + assert s.code == "print(1)" and s.meta["provenance"] == "distilled" + + print("test_update OK") + + +if __name__ == "__main__": + run() From 91a2090a115a0ffb62f72ba63a94ff27852d4fe5 Mon Sep 17 00:00:00 2001 From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 06:12:30 +0000 Subject: [PATCH 2/6] skills: prompt hint helper + skill_mode overlay; env-based model for skill_use CLI - skills/prompt.with_skill_hint: prepend skill-library usage hint to task prompt (non-invasive; webwright merges system_template by replacement, so prompt-level is the clean way) - config/skill_mode.yaml: optional overlay 
doc + step budget for skill-reuse runs - llm._model(): bare CLI (python -m webwright.tools.skill_use) builds model from SKILL_MODEL_NAME/ENDPOINT (or OPENAI_*) env -> same backend as agent, no hardcoded gateway Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> --- src/webwright/config/skill_mode.yaml | 13 +++++++++++++ src/webwright/skills/__init__.py | 3 ++- src/webwright/skills/prompt.py | 26 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 src/webwright/config/skill_mode.yaml create mode 100644 src/webwright/skills/prompt.py diff --git a/src/webwright/config/skill_mode.yaml b/src/webwright/config/skill_mode.yaml new file mode 100644 index 0000000..b8d6158 --- /dev/null +++ b/src/webwright/config/skill_mode.yaml @@ -0,0 +1,13 @@ +# Skill-library mode (optional overlay): raise the step budget headroom for skill reuse runs. +# Enable by stacking: webwright run ... -c base.yaml -c model_openai.yaml -c skill_mode.yaml +# +# How the agent is told to reuse skills: the SKILL-LIBRARY block is prepended to the TASK prompt +# by the caller (see webwright.skills.prompt.with_skill_hint), NOT injected via system_template — +# webwright merges system_template by replacement, so a prompt-level hint is the clean, non-invasive +# way and keeps webwright's default behavior unchanged when this mode is off. +# +# Requires env: +# SKILL_LIBRARY_ROOT path to the skill library (read by the skill_use tool) +# SKILL_MODEL_NAME / SKILL_MODEL_ENDPOINT (optional) backend for skill_use; defaults to OPENAI_* +agent: + step_limit: 100 diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py index 6f3383f..a4d119d 100644 --- a/src/webwright/skills/__init__.py +++ b/src/webwright/skills/__init__.py @@ -14,8 +14,9 @@ from .gate import gate, GateResult from .update import evolve, Trace # NOTE: don't import the `update` function here — it would from .llm import configure_llm # shadow the `update` submodule. 
Use update.evolve / update.update. +from .prompt import with_skill_hint __all__ = [ "Library", "Skill", "retrieve", "Candidate", "decide", "Decision", - "gate", "GateResult", "evolve", "Trace", "configure_llm", + "gate", "GateResult", "evolve", "Trace", "configure_llm", "with_skill_hint", ] diff --git a/src/webwright/skills/prompt.py b/src/webwright/skills/prompt.py new file mode 100644 index 0000000..5497b1c --- /dev/null +++ b/src/webwright/skills/prompt.py @@ -0,0 +1,26 @@ +"""Prompt helper: prepend a SKILL-LIBRARY hint to a task prompt so the agent reuses the library. + +Kept at the prompt level (not system_template) because webwright merges system_template by +replacement; a task-prompt hint is non-invasive and leaves default behavior unchanged when unused. +""" +from __future__ import annotations + +_HINT = """## Skill library (reuse before solving from scratch) +A library of previously-built executable code skills may contain one that helps this task. +BEFORE planning from scratch, query it ONCE from bash: + + python -m webwright.tools.skill_use --task "{task}" --library "{library}" + +It returns JSON {{verdict, skill_id, source_path, how_to_reuse}}: +- "use" : read source_path, copy it into your final_script, fill THIS task's params. +- "adapt" : read source_path, reuse its login/navigation/extraction core, change ONLY the last step. +- "skip" : no useful skill — solve from scratch. +Record your choice in skill_decision.json ({{skill_id, verdict, reason}}) before acting. 
+ +--- +""" + + +def with_skill_hint(task_prompt: str, *, task: str, library: str) -> str: + """Prepend the skill-library hint to a task prompt.""" + return _HINT.format(task=task.replace('"', "'"), library=library) + task_prompt From a96336b44875ebeebc9e7403e1b5f4778762ae22 Mon Sep 17 00:00:00 2001 From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:06:18 +0000 Subject: [PATCH 3/6] skills: README for the module + env-based model for bare CLI - README: what the module is, the two plug points (skill_use tool + update CLI), components table, gate semantics, backend config, results summary - llm._model(): bare CLI builds model from SKILL_MODEL_NAME/ENDPOINT (or OPENAI_*) env Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> --- src/webwright/skills/README.md | 76 ++++++++++++++++++++++++++++++++++ src/webwright/skills/llm.py | 14 ++++++- 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 src/webwright/skills/README.md diff --git a/src/webwright/skills/README.md b/src/webwright/skills/README.md new file mode 100644 index 0000000..6ca27a7 --- /dev/null +++ b/src/webwright/skills/README.md @@ -0,0 +1,76 @@ +# `webwright.skills` — a memory / skill-library module for Webwright + +Turn solved tasks into **reusable, executable code skills**, retrieve and judge them at solve +time, gate what enters the library, and grow the library incrementally. A self-evolving loop: + +``` +solve --(gate: gold | self_verify)--> admit --(evolve: refine + parameterize + primitives)--> library + ^ | + | skill_use tool: retrieve + decide (use / adapt / skip) <--------------------------------------- + +``` + +This is the missing **reuse + accumulation** layer: Webwright already turns a task into a +parameterized script (`crafted_cli`); this module accumulates those across tasks, judges when a +prior skill applies, and improves skills as more solves arrive — with a gate so wrong solves don't +pollute the library. 
+ +## How it plugs into Webwright + +Two touch points, **no change to the agent loop or default config**: + +1. **Reuse at solve time — the `skill_use` tool.** The agent invokes it from bash, exactly like + `self_reflection` / `image_qa`: + ```bash + python -m webwright.tools.skill_use --task "" --library "$SKILL_LIBRARY_ROOT" + ``` + It returns JSON `{verdict: use|adapt|skip, skill_id, source_path, how_to_reuse}`. The agent + reads `source_path` and reuses the skill (use = as-is, adapt = reuse core + change last step, + skip = solve from scratch). `webwright.skills.with_skill_hint(prompt, ...)` prepends a one-line + usage hint to the task prompt so the agent remembers to query the library first. + +2. **Growth after solving — the `update` CLI.** Distill a batch of gate-passed solves into a + library skill (offline, not in the solve loop): + ```bash + python -m webwright.skills.update --manifest batch.json --library ./library + ``` + `batch.json = {"template": "...", "runs": [{"dir","admit","params", ...}, ...]}`. + +## Components + +| file | role | +|---|---| +| `library.py` | `Skill` + `Library(root)`: on-disk skills (`/skill.py` + `meta.json`) | +| `retrieve.py` | `retrieve(task, library)` → ranked `Candidate`s (relevance) | +| `decide.py` | `decide(task, candidates)` → `Decision(verdict, skill_id, reason)` (utility: use/adapt/skip) | +| `gate.py` | `gate(result, method=gold\|self_verify\|none)` → admit? 
(keeps wrong solves out) | +| `update.py` | `evolve(traces, library)`: grow on the existing library — add / adapt-refine / keep; `_refine` parameterizes + decomposes into primitives, incrementally improving an existing skill | +| `llm.py` | `configure_llm(model)` + `llm()`: **backend-agnostic** via Webwright's `Model` abstraction; a bare CLI builds the model from `SKILL_MODEL_NAME`/`SKILL_MODEL_ENDPOINT` (or `OPENAI_*`) env — no hardcoded endpoint/key | +| `prompt.py` | `with_skill_hint(prompt, task, library)`: non-invasive task-prompt hint | + +## Gate + +`gate(method=...)` is the **admission** check (independent of the solving agent — not the same as +`self_reflection`, which is the agent's own completion condition): +- `gold` — compare against a known answer (benchmarks); strongest. +- `self_verify` — invariant only (non-empty + shape); weak placeholder when no gold exists. It does + not check *correctness*. Note: `self_reflection` cannot serve as the gate — `require_self_reflection_success` + makes it always `predicted_label==1`, so it would admit everything (agent grading itself). +- `none` — admit all (demos). + +## Backend + +Backend-agnostic. Either `configure_llm(model_config_or_Model)` once in-process, or set +`SKILL_MODEL_NAME` / `SKILL_MODEL_ENDPOINT` (falling back to `OPENAI_*`) so a bare tool invocation +uses the same backend as the running agent. No gateway or key is hardcoded. + +## Results (summary) + +Validated with this module (full data + analysis live in the companion research repo, not here): +- **Real website (public GitHub, read-only):** end-to-end loop works — two repos solved from + scratch → `update` distilled a parameterized skill → a held-out repo solved by reusing it + (agent called `skill_use`, verdict `use`, answer correct). +- **Incremental growth:** a second batch improves the existing skill in place (keeps the working + functions, adds robustness) rather than rewriting it. 
+- **Gate prevents pollution:** a wrong solve is dropped and never enters the library. +- **Reuse value is task-dependent:** step savings are modest on easy tasks (query overhead ≈ the + exploration it saves) and larger on harder tasks with more exploration to skip. diff --git a/src/webwright/skills/llm.py b/src/webwright/skills/llm.py index 2e0703e..278694a 100644 --- a/src/webwright/skills/llm.py +++ b/src/webwright/skills/llm.py @@ -25,8 +25,18 @@ def configure_llm(model: Any) -> None: def _model() -> Any: if _DEFAULT_MODEL is not None: return _DEFAULT_MODEL - # default: openai model from env (OPENAI_API_KEY / OPENAI_BASE_URL respected by the model class) - return get_model({"model_class": "openai"}) + # default: build an openai-style model from env so a bare CLI invocation + # (e.g. `python -m webwright.tools.skill_use`) uses the SAME backend as the running agent. + # Honors SKILL_MODEL_NAME / SKILL_MODEL_ENDPOINT (or OPENAI_* fallbacks); no hardcoded gateway. + import os + cfg = {"model_class": os.environ.get("SKILL_MODEL_CLASS", "openai")} + name = os.environ.get("SKILL_MODEL_NAME") or os.environ.get("OPENAI_MODEL") + endpoint = os.environ.get("SKILL_MODEL_ENDPOINT") or os.environ.get("OPENAI_ENDPOINT") + if name: + cfg["model_name"] = name + if endpoint: + cfg["openai_endpoint"] = endpoint + return get_model(cfg) def llm(system: str, user: str, *, model: Any = None, **_: Any) -> str: From 82fe0ba5a40437d23ce691a1710abb2e345bc7ce Mon Sep 17 00:00:00 2001 From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:06:52 +0000 Subject: [PATCH 4/6] docs+tests: README skill-library section + skills unit tests - README: Skill Library section (what it is, reuse via skill_use tool, grow via update CLI, end-to-end validation summary) - tests/skills: 5 unit tests for library/gate/update/evolve/retrieve+decide Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 
insertions(+) diff --git a/README.md b/README.md index 891bbd7..042c61f 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,25 @@ python assets/task_showcase/app.py \ --- +## 🧠 Skill Library (reuse solved tasks across tasks) + +[`webwright.skills`](src/webwright/skills/) turns solved tasks into **reusable, executable code +skills**, retrieves and judges them at solve time, gates what enters the library, and grows the +library incrementally — a self-evolving *store → retrieve → use/adapt → gate → evolve* loop on top +of Webwright's code-as-action solves. Plugs in with **no change to the agent loop**: + +- **Reuse** — the agent calls `python -m webwright.tools.skill_use --task "..." --library ...` + (like `self_reflection`/`image_qa`); it returns `{verdict: use|adapt|skip, source_path}`. +- **Grow** — `python -m webwright.skills.update --manifest batch.json --library ./library` + distills a batch of gate-passed solves into a parameterized, primitive-decomposed skill. + +Validated end-to-end on a real public website (read-only GitHub): solve two repos from scratch → +`update` builds a parameterized skill → a held-out repo is solved by reusing it (agent calls +`skill_use`, verdict `use`, answer correct); a wrong solve is kept out by the gate; a second batch +improves the existing skill in place. See [`src/webwright/skills/README.md`](src/webwright/skills/README.md). 
+ +--- + ## 🚀 Quick Start ### Prerequisites From 05aab3a081eef593da77bbaa179f7e7038de06d9 Mon Sep 17 00:00:00 2001 From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:18:12 +0000 Subject: [PATCH 5/6] skills: translate all comments/docstrings to English Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> --- src/webwright/skills/decide.py | 6 ++-- src/webwright/skills/gate.py | 28 +++++++++++------- src/webwright/skills/retrieve.py | 8 +++-- src/webwright/skills/update.py | 51 ++++++++++++++++++-------------- 4 files changed, 53 insertions(+), 40 deletions(-) diff --git a/src/webwright/skills/decide.py b/src/webwright/skills/decide.py index eaddc8d..a966adb 100644 --- a/src/webwright/skills/decide.py +++ b/src/webwright/skills/decide.py @@ -1,8 +1,8 @@ -"""判断用不用:候选 + 任务 → use / adapt / skip(utility)。 +"""Decide whether to use: candidates + task -> use / adapt / skip (utility). -接口稳定(实现可换): +Stable interface (swappable implementation): decide(task, candidates, *, method="llm") -> Decision -相关 ≠ 有用:retrieve 给"像不像",decide 给"该不该用、怎么用"。 +Relevant != useful: retrieve gives "how similar", decide gives "whether and how to use it". """ from __future__ import annotations from dataclasses import dataclass diff --git a/src/webwright/skills/gate.py b/src/webwright/skills/gate.py index 4ebc43e..0e9baac 100644 --- a/src/webwright/skills/gate.py +++ b/src/webwright/skills/gate.py @@ -1,17 +1,23 @@ -"""准入闸:只有"对的"解/技能才准进库,防 correct-but-narrow / regression 污染。 -gate 是【独立第二只眼】,与解题 agent 自己的 self_reflection 不同(后者是解题完成条件)。 +"""Admission gate: only "correct" solves/skills enter the library, preventing correct-but-narrow / +regression pollution. The gate is an INDEPENDENT second eye — distinct from the solving agent's own +self_reflection (which is a solve-completion condition, not an admission check). 
-接口稳定(实现可换),method 可配置: +Stable interface (swappable implementation), configurable method: gate(result, *, gold=None, output_schema=None, method="auto") -> GateResult -- method="gold" : 与 gold 比对(WebArena 等有标准答案;真独立、能挡住抽错的解)。★推荐 -- method="self_verify" : 不变量(result 非空 + shape 合 output_schema)。无 gold 时的弱占位。 - ⚠️ 局限:只查"有没有/形状对不对",不查"对不对"——抽错但非空的答案照样放行。 - (注:webwright 的 self_reflection 因 require_self_reflection_success 而恒为 - predicted_label==1,故不能用它当 gate;那是解题完成条件,非独立准入。) -- method="none" : 不把关(纯演示复用,不防污染)。 -- method="auto" : 有 gold 用 gold,否则 self_verify。 -升级路径(next step):真实站用 WebJudge(OM2W 官方 judge)或跨源一致核验,做真独立把关。 +- method="gold" : compare against gold (benchmarks like WebArena; truly independent, catches + mis-extracted solves). Recommended. +- method="self_verify" : invariant only (result non-empty + shape matches output_schema). A weak + placeholder when no gold exists. + Limitation: only checks "present / right shape", not "correct" — a wrong but + non-empty answer is admitted anyway. + (Note: webwright's self_reflection is always predicted_label==1 due to + require_self_reflection_success, so it cannot serve as the gate — that is a + solve-completion condition, not independent admission.) +- method="none" : no gate (demo reuse only, no pollution protection). +- method="auto" : use gold if available, else self_verify. +Upgrade path (next step): for real websites use WebJudge (OM2W's official judge) or cross-source +consistency checks for a truly independent gate. """ from __future__ import annotations from dataclasses import dataclass diff --git a/src/webwright/skills/retrieve.py b/src/webwright/skills/retrieve.py index fae5d1e..5f7a405 100644 --- a/src/webwright/skills/retrieve.py +++ b/src/webwright/skills/retrieve.py @@ -1,8 +1,9 @@ -"""取:任务 → 最相关的候选技能(relevance)。 +"""Retrieve: task -> most relevant candidate skills (relevance only). 
-接口稳定(实现可换): +Stable interface (swappable implementation): retrieve(task, library, *, k=3, method="llm") -> [Candidate] -MVP: 单次 LLM 调用,把整库当 flat catalog 列进 prompt 让它选。库大了换 embedding,接口不变。 +MVP: a single LLM call that lists the whole library as a flat catalog in the prompt and lets it +pick. Swap to embeddings when the library grows large — the interface stays the same. """ from __future__ import annotations from dataclasses import dataclass @@ -15,6 +16,7 @@ class Candidate: skill: Skill score: float # relevance 0..1 + reason: str diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py index bcb4a80..61101fa 100644 --- a/src/webwright/skills/update.py +++ b/src/webwright/skills/update.py @@ -1,11 +1,13 @@ -"""沉淀(间歇):把通过 gate 的解蒸馏、写回 library,让库从使用中长大。 +"""Sediment (intermittent): distill gate-passed solves back into the library so it grows from use. -接口稳定(实现可换): - update(traces, library, *, method="grow") -> [被加/更新的 skill_id] +Stable interface (swappable implementation): + update(traces, library, *, method="grow") -> [added/updated skill_ids] -- method="grow" : 库里还没覆盖这个 template 的,就把这条成功解原样提升为技能(最小形态)。 -- method="refine" : 批量提炼——对齐 N 个 gate 过的解 → 参数化(泛化) + 拆出可复用 primitive + 薄任务层 - → 一个更好的库技能。这是"update 加泛化性 + primitive 复用性"的实现(单次 LLM 批量调用)。 +- method="grow" : if the library does not yet cover this template, promote the successful solve + as-is into a skill (minimal form). +- method="refine" : batch distillation — align N gate-passed solves -> parameterize (generalize) + + factor out reusable primitives + a thin task layer -> one better library skill. + This is where update adds generalization + primitive reusability (one batched LLM call). 
""" from __future__ import annotations import json @@ -19,10 +21,10 @@ @dataclass class Trace: template: str - code: str # 这条任务的 final_script(已过 gate = 正确) + code: str # this task's final_script (already gate-passed = correct) answer: object = None meta: dict = field(default_factory=dict) # params / site / start_url / output_schema ... - # usage:这条任务是怎么用库的(驱动 update 的信号) + # usage: how this task used the library (the signal that drives update) used_skill_id: str | None = None verdict: str | None = None # use | adapt | skip correct: bool = True @@ -69,14 +71,15 @@ def _extract_code(txt: str) -> str: def _refine(traces: list[Trace], library: Library) -> list[str]: - """批量提炼:对齐 N 个 gate 过的解 → 参数化 + primitive。 - 增量:若库里已有同 template 技能,则在【现有技能基础上】改进/加宽(而非从原始解重写)。""" + """Batch distillation: align N gate-passed solves -> parameterize + primitives. + Incremental: if a skill for the same template already exists, improve/widen it on top of the + existing skill (rather than rewriting from the raw solves).""" if not traces: return [] template = traces[0].template schema = traces[0].meta.get("output_schema") sid = _slug(template) - existing = library.get(sid) # 已有技能?→ 增量演化 + existing = library.get(sid) # skill already exists? -> incremental evolution blocks = [f"## Template\n{template}\n\n## Required output_schema for retrieved_data\n{json.dumps(schema)}\n"] if existing and existing.code: @@ -120,21 +123,23 @@ def _grow(traces: list[Trace], library: Library) -> list[str]: def evolve(traces: list[Trace], library: Library) -> dict: - """统一 update:在【已有库】上,按每条轨迹的 usage(use/adapt/skip)决定怎么改库。 - 这是"可持续增长的库"的核心——不是每次从零建,而是在 v_{n-1} 上长出 v_n。 + """Unified update: evolve the EXISTING library, deciding per trace's usage (use/adapt/skip) how + to change it. This is the core of a continuously-growing library — not rebuilt from scratch each + time, but grown from v_{n-1} into v_n. 
- - USE 成功的轨迹:技能够好,不动库(只是复用证据)。 - - ADAPT 成功的轨迹:复用了核心、fix 了末端 → 把这批 fix 后的解【提炼回该 template 的技能】 - (加宽/更稳)。这就是"fix 沉淀进库"。 - - SKIP / 库没覆盖:该 template 还没有技能 → 用这批解【新增】一个技能。 + - USE (successful) : the skill is good enough, leave it untouched (just reuse evidence). + - ADAPT (successful) : core reused, last step fixed -> refine this batch's fixed solves + back into the template's skill (widen/harden). This is how a fix + sediments into the library. + - SKIP / not yet covered : the template has no skill yet -> add one from this batch. - 只吃 gate 过(correct=True)的轨迹(防污染)。返回一份 changelog。 + Only consumes gate-passed (correct=True) traces (pollution protection). Returns a changelog. """ good = [t for t in traces if t.correct] changelog = {"use": [], "adapt_refined": [], "added": [], "dropped_wrong": len(traces) - len(good)} existing_templates = {s.meta.get("template"): s.skill_id for s in library.list()} - # 按 template 分组(同族解一起提炼/沉淀) + # group by template (same-family solves are distilled/sedimented together) by_tmpl: dict[str, list[Trace]] = {} for t in good: by_tmpl.setdefault(t.template, []).append(t) @@ -142,15 +147,15 @@ def evolve(traces: list[Trace], library: Library) -> dict: for tmpl, group in by_tmpl.items(): verdicts = {t.verdict for t in group} if tmpl not in existing_templates: - # 库没覆盖 → 新增(用这批解提炼出一个技能) + # not covered -> add (distill a skill from this batch) sid = _refine(group, library)[0] changelog["added"].append(sid) elif "adapt" in verdicts: - # 有 fix 发生 → 把 fix 后的解重新提炼回该技能(加宽/更稳) - sid = _refine(group, library)[0] # _refine 用同 slug,覆盖加宽 + # a fix happened -> refine the fixed solves back into the skill (widen/harden) + sid = _refine(group, library)[0] # _refine uses the same slug, overwrites + widens changelog["adapt_refined"].append(sid) else: - # 全 use 成功 → 技能够好,不动 + # all use-success -> skill is good enough, leave it changelog["use"].append(existing_templates[tmpl]) return changelog From bb0d0cde7b5f346582f0590b0bcb9b96051fa2e5 Mon Sep 17 00:00:00 2001 
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:22:07 +0000 Subject: [PATCH 6/6] skills: drop dead code in update (grow path superseded by evolve) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove _grow / update() / _UPDATERS dispatch — evolve() is the single entry now; drop the test_update test that exercised the removed grow path. Keep retrieve/llm fallbacks (useful). Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com> --- src/webwright/skills/__init__.py | 4 ++-- src/webwright/skills/update.py | 21 ------------------- tests/skills/test_update.py | 36 -------------------------------- 3 files changed, 2 insertions(+), 59 deletions(-) delete mode 100644 tests/skills/test_update.py diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py index a4d119d..79a0bf3 100644 --- a/src/webwright/skills/__init__.py +++ b/src/webwright/skills/__init__.py @@ -12,8 +12,8 @@ from .retrieve import retrieve, Candidate from .decide import decide, Decision from .gate import gate, GateResult -from .update import evolve, Trace # NOTE: don't import the `update` function here — it would -from .llm import configure_llm # shadow the `update` submodule. Use update.evolve / update.update. 
+from .update import evolve, Trace +from .llm import configure_llm from .prompt import with_skill_hint __all__ = [ diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py index 61101fa..e451859 100644 --- a/src/webwright/skills/update.py +++ b/src/webwright/skills/update.py @@ -108,20 +108,6 @@ def _refine(traces: list[Trace], library: Library) -> list[str]: return [sid] -def _grow(traces: list[Trace], library: Library) -> list[str]: - existing = {s.meta.get("template") for s in library.list()} - added = [] - for tr in traces: - if not tr.template or tr.template in existing: - continue - sid = _slug(tr.template) - meta = {"template": tr.template, "provenance": "distilled", **tr.meta} - library.add(Skill(skill_id=sid, code=tr.code, meta=meta)) - existing.add(tr.template) - added.append(sid) - return added - - def evolve(traces: list[Trace], library: Library) -> dict: """Unified update: evolve the EXISTING library, deciding per trace's usage (use/adapt/skip) how to change it. This is the core of a continuously-growing library — not rebuilt from scratch each @@ -160,13 +146,6 @@ def evolve(traces: list[Trace], library: Library) -> dict: return changelog -_UPDATERS = {"grow": _grow, "refine": _refine} - - -def update(traces, library: Library, *, method: str = "grow") -> list[str]: - return _UPDATERS[method](traces, library) - - # ---------- CLI: batch update via a manifest ---------- def traces_from_manifest(manifest: dict) -> list["Trace"]: """manifest = {"template": str, "runs": [{"dir","admit","params","answer"?,"verdict"?}, ...]}. 
diff --git a/tests/skills/test_update.py b/tests/skills/test_update.py deleted file mode 100644 index 2abfffe..0000000 --- a/tests/skills/test_update.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Unit test: update 'grow' (deterministic, no LLM).""" -import sys, tempfile -from pathlib import Path -pass -from webwright.skills.library import Library -from webwright.skills.update import update, Trace - - -def run(): - with tempfile.TemporaryDirectory() as d: - lib = Library(d) - assert lib.list() == [] - - traces = [ - Trace(template="Get the top-{n} best-selling {entity} in {period}", code="print(1)", - answer=["Sprite"], meta={"site": "shopping_admin"}), - Trace(template="Get the reviewers who mention {x}", code="print(2)", answer=["Bob"]), - ] - added = update(traces, lib, method="grow") - assert len(added) == 2, "two new templates -> two skills added" - assert len(lib.list()) == 2, "library grew to 2" - - # idempotent: same templates again -> nothing new - added2 = update(traces, lib, method="grow") - assert added2 == [], "already-covered templates not re-added" - assert len(lib.list()) == 2, "library unchanged" - - # the distilled skill carries its code + provenance - s = lib.get(added[0]) - assert s.code == "print(1)" and s.meta["provenance"] == "distilled" - - print("test_update OK") - - -if __name__ == "__main__": - run()