From 4ae9ab8768ab35b3b0a6cfb806972d14ca48da98 Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 06:06:58 +0000
Subject: [PATCH 1/6] skills: add webwright.skills memory/skill-library module
 + skill_use tool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A built-in submodule turning solved tasks into reusable, executable code skills:
- skills/{library,retrieve,decide,gate,update,llm}: store / retrieve (relevance) /
  decide (use·adapt·skip utility) / admission gate (gold|self_verify|none) /
  evolve (incremental growth on existing library) — backend-agnostic via configure_llm
  over webwright's own Model abstraction (no hardcoded gateway/key/path)
- tools/skill_use.py: solve-time tool (agent invokes like self_reflection/image_qa) ->
  retrieve+decide -> JSON recommendation (use/adapt/skip + source path)
- python -m webwright.skills.update --manifest batch.json --library ./lib : batch growth
- tests/skills: 5 unit tests pass against the migrated module (logic == original)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/webwright/skills/__init__.py     |  21 +++
 src/webwright/skills/decide.py       |  50 +++++++
 src/webwright/skills/gate.py         |  62 ++++++++
 src/webwright/skills/library.py      |  59 ++++++++
 src/webwright/skills/llm.py          |  57 ++++++++
 src/webwright/skills/retrieve.py     |  75 ++++++++++
 src/webwright/skills/update.py       | 208 +++++++++++++++++++++++++++
 src/webwright/tools/skill_use.py     |  72 ++++++++++
 tests/skills/test_evolve.py          |  49 +++++++
 tests/skills/test_gate.py            |  35 +++++
 tests/skills/test_library.py         |  33 +++++
 tests/skills/test_retrieve_decide.py |  40 ++++++
 tests/skills/test_update.py          |  36 +++++
 13 files changed, 797 insertions(+)
 create mode 100644 src/webwright/skills/__init__.py
 create mode 100644 src/webwright/skills/decide.py
 create mode 100644 src/webwright/skills/gate.py
 create mode 100644 src/webwright/skills/library.py
 create mode 100644 src/webwright/skills/llm.py
 create mode 100644 src/webwright/skills/retrieve.py
 create mode 100644 src/webwright/skills/update.py
 create mode 100644 src/webwright/tools/skill_use.py
 create mode 100644 tests/skills/test_evolve.py
 create mode 100644 tests/skills/test_gate.py
 create mode 100644 tests/skills/test_library.py
 create mode 100644 tests/skills/test_retrieve_decide.py
 create mode 100644 tests/skills/test_update.py

diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py
new file mode 100644
index 0000000..6f3383f
--- /dev/null
+++ b/src/webwright/skills/__init__.py
@@ -0,0 +1,21 @@
+"""webwright.skills — a memory/skill library module for webwright.
+
+Store solved tasks as reusable, executable code skills; retrieve + judge (use/adapt/skip) at
+solve time; admit via a gate; and grow the library incrementally (evolve). Plugs into webwright
+as a built-in submodule:
+  - solve-time reuse  : the `skill_use` tool (agent invokes it like self_reflection / image_qa)
+  - offline growth    : `update.evolve` (run after solves to distill gate-passed solves into skills)
+
+Backend-agnostic: configure_llm(model) wires it to any webwright Model.
+"""
+from .library import Library, Skill
+from .retrieve import retrieve, Candidate
+from .decide import decide, Decision
+from .gate import gate, GateResult
+from .update import evolve, Trace        # NOTE: don't import the `update` function here — it would
+from .llm import configure_llm           # shadow the `update` submodule. Use update.evolve / update.update.
+
+__all__ = [  # public API: the names re-exported by the imports above
+    "Library", "Skill", "retrieve", "Candidate", "decide", "Decision",
+    "gate", "GateResult", "evolve", "Trace", "configure_llm",
+]
diff --git a/src/webwright/skills/decide.py b/src/webwright/skills/decide.py
new file mode 100644
index 0000000..eaddc8d
--- /dev/null
+++ b/src/webwright/skills/decide.py
@@ -0,0 +1,50 @@
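+"""Decide whether to use a skill: candidates + task -> use / adapt / skip (utility).
+
+Stable interface (the implementation behind it may change):
+    decide(task, candidates, *, method="llm") -> Decision
+Relevant != useful: retrieve answers "does it look similar", decide answers "should it be used, and how".
+"""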
+from __future__ import annotations
+from dataclasses import dataclass
+
+from .llm import llm_json
+
+
+@dataclass
+class Decision:  # outcome of decide(): what to do with which candidate skill
+    verdict: str            # "use" | "adapt" | "skip"
+    skill_id: str | None    # chosen skill; None when the verdict is "skip"
+    reason: str             # free-text justification (LLM "reason" field or a fixed message)
+
+
+def _decide_llm(task: str, candidates) -> Decision:
+    """One LLM call: pick use/adapt/skip. A reply that is malformed or names a non-candidate id degrades to skip."""
+    if not candidates:
+        return Decision("skip", None, "no candidate skills")
+    cat = "\n".join(
+        f"- skill_id: {c.skill.skill_id} | template: {c.skill.meta.get('template','')} | "
+        f"summary: {c.skill.summary} | params: {c.skill.signature.get('params', [])}"
+        for c in candidates
+    )
+    system = (
+        "Decide whether a library skill is worth using for THIS task. Output STRICT JSON: "
+        '{"verdict":"use|adapt|skip","skill_id":"...","reason":"..."}.\n'
+        "- use   = the skill fits the task as-is (just different parameter values).\n"
+        "- adapt = the skill's expensive core (login / navigation / extraction) is reusable, but the "
+        "FINAL step differs; the agent should reuse the front and add/adapt only the last step.\n"
+        "- skip  = no candidate is worth it; solve from scratch (skill_id = null).\n"
+        "Relevance is not enough — only 'use'/'adapt' if it genuinely saves work."
+    )
+    user = f"## Task\n{task}\n\n## Candidate skills (most relevant first)\n{cat}"
+    out = llm_json(system, user)
+    verdict, skill_id = out.get("verdict", "skip"), out.get("skill_id")
+    if verdict not in ("use", "adapt") or skill_id not in [c.skill.skill_id for c in candidates]:
+        verdict, skill_id = "skip", None
+    return Decision(verdict=verdict, skill_id=skill_id, reason=out.get("reason", ""))
+
+
+_DECIDERS = {"llm": _decide_llm}  # method name -> decider implementation
+
+
+def decide(task: str, candidates, *, method: str = "llm") -> Decision:  # dispatch on `method`
+    return _DECIDERS[method](task, candidates)  # an unregistered method raises KeyError
diff --git a/src/webwright/skills/gate.py b/src/webwright/skills/gate.py
new file mode 100644
index 0000000..4ebc43e
--- /dev/null
+++ b/src/webwright/skills/gate.py
@@ -0,0 +1,62 @@
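+"""Admission gate: only "correct" solutions/skills may enter the library, guarding against correct-but-narrow / regression pollution.
+The gate is an independent second pair of eyes, distinct from the solving agent's own self_reflection (which is the solve-completion condition).
+
+Stable interface (the implementation may change); the method is configurable:
+    gate(result, *, gold=None, output_schema=None, method="auto") -> GateResult
+
+- method="gold"        : compare against gold (WebArena etc. have reference answers; truly independent, blocks wrongly extracted solutions). ★ recommended
+- method="self_verify" : invariants (result non-empty + shape matches output_schema). A weak placeholder when there is no gold.
+                         ⚠️ Limitation: it only checks "is there something / is the shape right", not "is it correct" — a wrong but non-empty answer still passes.
+                         (Note: webwright's self_reflection always has predicted_label==1 because of require_self_reflection_success,
+                         so it cannot serve as the gate; it is the solve-completion condition, not an independent admission check.)
+- method="none"        : no gating (pure reuse demo, no protection against pollution).
+- method="auto"        : use gold when gold is given, otherwise self_verify.
+Upgrade path (next step): on real sites use WebJudge (the official OM2W judge) or cross-source consistency checks for truly independent gating.
+"""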
+from __future__ import annotations
+from dataclasses import dataclass
+
+
+@dataclass
+class GateResult:  # verdict of the admission gate
+    admit: bool   # True -> the result is admitted
+    reason: str   # short explanation of the verdict
+
+
+def _shape_ok(result, output_schema) -> bool:
+    """True when `result` has the Python type matching output_schema["type"]; no schema / unknown type pass."""
+    if not output_schema:
+        return True
+    kind = output_schema.get("type")
+    # container and string kinds map one-to-one onto a Python type
+    for name, py_type in (("array", list), ("object", dict), ("string", str)):
+        if kind == name:
+            return isinstance(result, py_type)
+    if kind in ("number", "integer"):
+        # bool is a subclass of int, so it has to be excluded explicitly
+        return isinstance(result, (int, float)) and not isinstance(result, bool)
+    return True
+
+
+def _self_verify(result, output_schema) -> GateResult:
+    """Weak invariants only: reject None, an empty list/dict/str, or a shape that misses output_schema."""
+    if result is None or (isinstance(result, (list, dict, str)) and not result):
+        return GateResult(False, "result is null" if result is None else "result is empty")
+    if _shape_ok(result, output_schema):
+        return GateResult(True, "self-verify passed (non-empty, shape ok)")
+    # only reachable with a schema: _shape_ok returns True when output_schema is falsy
+    return GateResult(False, f"shape != output_schema ({output_schema.get('type')})")
+
+
+def _gold(result, gold) -> GateResult:
+    """Admit iff `result == gold` (plain Python equality)."""
+    return GateResult(True, "matches gold") if result == gold else GateResult(False, "differs from gold")
+
+
+def gate(result, *, gold=None, output_schema=None, method: str = "auto") -> GateResult:
+    """Admission verdict. method="gold" with gold=None is rejected (it used to admit a None result via None == None)."""
+    if method == "none":
+        return GateResult(True, "no gate (admit all)")
+    if gold is not None and method in ("gold", "auto"):
+        return _gold(result, gold)
+    return GateResult(False, "method='gold' needs a gold answer") if method == "gold" else _self_verify(result, output_schema)
diff --git a/src/webwright/skills/library.py b/src/webwright/skills/library.py
new file mode 100644
index 0000000..9bb6345
--- /dev/null
+++ b/src/webwright/skills/library.py
@@ -0,0 +1,59 @@
+"""Skill store. A skill = a directory under the library root holding skill.py + meta.json.
+
+Interface (stable — implementations behind it may change):
+    Library(root).list() -> [Skill]
+    Library(root).get(skill_id) -> Skill | None
+    Library(root).add(skill)            # write skill.py + meta.json
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class Skill:  # one library entry, persisted as <root>/<skill_id>/{skill.py, meta.json}
+    skill_id: str
+    code: str                    # source of skill.py
+    meta: dict = field(default_factory=dict)   # {template, site, signature, summary, ...}
+
+    @property
+    def summary(self) -> str:  # meta["summary"], or "" when the key is absent
+        return self.meta.get("summary", "")
+
+    @property
+    def signature(self) -> dict:  # meta["signature"], or {} when the key is absent
+        return self.meta.get("signature", {})
+
+
+class Library:
+    """Directory-backed skill store: <root>/<skill_id>/{skill.py, meta.json}, read and written as UTF-8."""
+    def __init__(self, root: str | Path):
+        self.root = Path(root)
+        self.root.mkdir(parents=True, exist_ok=True)  # constructing a Library creates its root
+
+    def _dir(self, skill_id: str) -> Path:
+        return self.root / skill_id
+
+    def list(self) -> list[Skill]:
+        """Return every entry that has a meta.json, ordered by directory name."""
+        return [s for s in (self.get(d.name) for d in sorted(self.root.iterdir())) if s]
+
+    def get(self, skill_id: str) -> Skill | None:
+        """Load one skill, or None without a meta.json; a missing skill.py yields code ""."""
+        d = self._dir(skill_id)
+        if not (d / "meta.json").exists():
+            return None
+        meta = json.loads((d / "meta.json").read_text(encoding="utf-8"))
+        code = (d / "skill.py").read_text(encoding="utf-8") if (d / "skill.py").exists() else ""
+        return Skill(skill_id=skill_id, code=code, meta=meta)
+
+    def add(self, skill: Skill) -> None:
+        """Write (or overwrite) skill.py + meta.json; UTF-8 is pinned because meta is dumped with ensure_ascii=False."""
+        d = self._dir(skill.skill_id)
+        d.mkdir(parents=True, exist_ok=True)
+        (d / "skill.py").write_text(skill.code, encoding="utf-8")
+        (d / "meta.json").write_text(json.dumps(skill.meta, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    def path(self, skill_id: str) -> Path:
+        return self._dir(skill_id) / "skill.py"
diff --git a/src/webwright/skills/llm.py b/src/webwright/skills/llm.py
new file mode 100644
index 0000000..2e0703e
--- /dev/null
+++ b/src/webwright/skills/llm.py
@@ -0,0 +1,57 @@
+"""LLM helper for the skills module — backend-agnostic, via webwright's own model abstraction.
+
+No hardcoded gateway/endpoint/key: the caller passes a webwright Model (or a model config dict),
+so this works with any backend webwright supports (openai / anthropic / openrouter / custom).
+"""
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Optional
+
+from webwright.models import get_model
+
+# Process-wide default model, set once via configure_llm() so retrieve/decide/update can call
+# llm() without each caller threading a Model through. Falls back to env-configured openai.
+_DEFAULT_MODEL: Optional[Any] = None
+
+
+def configure_llm(model: Any) -> None:
+    """Register the default Model: a dict is turned into a Model via get_model(), anything else is stored as-is."""
+    global _DEFAULT_MODEL
+    _DEFAULT_MODEL = get_model(model) if isinstance(model, dict) else model
+
+
+def _model() -> Any:
+    """Return the Model registered by configure_llm(), or build an openai one when nothing was registered."""
+    # The fallback is rebuilt on every call (it is not cached). Presumably the openai model class is
+    # what reads OPENAI_API_KEY / OPENAI_BASE_URL from the environment — not visible from this module.
+    return _DEFAULT_MODEL if _DEFAULT_MODEL is not None else get_model({"model_class": "openai"})
+
+
+def llm(system: str, user: str, *, model: Any = None, **_: Any) -> str:
+    """Single-turn call returning raw text; `model` overrides the default, other keyword args are ignored."""
+    chat = _model() if model is None else model
+    messages = [
+        chat.format_message(role=role, content=text)
+        for role, text in (("system", system), ("user", user))
+    ]
+    return chat(messages)
+
+
+def llm_json(system: str, user: str, **kw: Any) -> dict:
+    """Call the LLM and return the first JSON object found in the reply ({} when there is none).
+
+    Each "{" is tried in turn with JSONDecoder.raw_decode, which parses one value and ignores any
+    trailing text, so prose (even prose containing braces) around the object no longer defeats parsing.
+    """
+    txt = llm(system, user, **kw)
+    decoder = json.JSONDecoder()
+    start = txt.find("{")
+    while start != -1:
+        try:
+            return decoder.raw_decode(txt[start:])[0]
+        except json.JSONDecodeError:
+            # no valid object starts at this brace (e.g. "{x}" in prose) -> try the next one
+            start = txt.find("{", start + 1)
+    return {}
diff --git a/src/webwright/skills/retrieve.py b/src/webwright/skills/retrieve.py
new file mode 100644
index 0000000..fae5d1e
--- /dev/null
+++ b/src/webwright/skills/retrieve.py
@@ -0,0 +1,75 @@
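+"""Retrieve: task -> the most relevant candidate skills (relevance).
+
+Stable interface (the implementation may change):
+    retrieve(task, library, *, k=3, method="llm") -> [Candidate]
+MVP: a single LLM call that lists the whole library as a flat catalog in the prompt and lets the model pick. Switch to embeddings once the library grows; the interface stays the same.
+"""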
+from __future__ import annotations
+from dataclasses import dataclass
+
+from .library import Library, Skill
+from .llm import llm_json
+
+
+@dataclass
+class Candidate:  # one retrieval hit
+    skill: Skill          # the library skill that matched
+    score: float          # relevance 0..1
+    reason: str           # why it matched (LLM "reason" text, or "keyword overlap")
+
+
+def _catalog(library: Library) -> str:
+    """Render the whole library as a flat text catalog, five lines per skill ("" when it is empty)."""
+    # one generator expression instead of an append loop; entries are joined by a newline
+    return "\n".join(
+        f"- skill_id: {s.skill_id}\n"
+        f"  template: {s.meta.get('template','')}\n"
+        f"  site: {s.meta.get('site','')}\n"
+        f"  summary: {s.summary}\n"
+        f"  params: {s.signature.get('params', [])}"
+        for s in library.list()
+    )
+
+
+def _retrieve_llm(task: str, library: Library, k: int) -> list[Candidate]:
+    """Ask the LLM for up to k relevant skills; malformed items and ids not in the library are skipped."""
+    cat = _catalog(library)
+    if not cat:
+        return []
+    system = (
+        "You match a web task to the most RELEVANT skills in a catalog (relevance only — not yet "
+        "whether to use them). Return STRICT JSON: "
+        '{"candidates":[{"skill_id":"...","score":<0..1>,"reason":"..."}]}, most relevant first, '
+        f"at most {k}. score = how relevant. If nothing is relevant, return an empty list.")
+    user = f"## Task\n{task}\n\n## Skill catalog\n{cat}\n\nReturn at most {k} candidates."
+    raw = llm_json(system, user).get("candidates")
+    cands = []
+    for c in (raw if isinstance(raw, list) else [])[:k]:  # the reply is untrusted: tolerate any JSON shape
+        sk = library.get(c["skill_id"]) if isinstance(c, dict) and isinstance(c.get("skill_id"), str) else None
+        if sk:
+            try:
+                score = float(c.get("score", 0))
+            except (TypeError, ValueError, OverflowError):
+                score = 0.0
+            cands.append(Candidate(skill=sk, score=score, reason=c.get("reason", "")))
+    return cands
+
+
+def _retrieve_simple(task: str, library: Library, k: int) -> list[Candidate]:
+    """No-LLM fallback: rank skills by how many lowercase task words also occur in template + summary."""
+    task_words = set(task.lower().split())
+    denom = len(task_words) or 1   # never zero
+    hits = []
+    for skill in library.list():
+        text = (skill.meta.get("template", "") + " " + skill.summary).lower()
+        shared = task_words.intersection(text.split())
+        if shared:
+            hits.append(Candidate(skill=skill, score=len(shared) / denom, reason="keyword overlap"))
+    return sorted(hits, key=lambda c: c.score, reverse=True)[:k]
+
+
+_RETRIEVERS = {"llm": _retrieve_llm, "simple": _retrieve_simple}  # method name -> retriever implementation
+
+
+def retrieve(task: str, library: Library, *, k: int = 3, method: str = "llm") -> list[Candidate]:  # dispatch on `method`
+    return _RETRIEVERS[method](task, library, k)  # an unregistered method raises KeyError
diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py
new file mode 100644
index 0000000..bcb4a80
--- /dev/null
+++ b/src/webwright/skills/update.py
@@ -0,0 +1,208 @@
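+"""Consolidation (periodic): distill gate-passed solutions and write them back to the library so it grows from use.
+
+Stable interface (the implementation may change):
+    update(traces, library, *, method="grow") -> [skill_ids added/updated]
+
+- method="grow"   : if the library does not cover this template yet, promote the successful solution as-is to a skill (minimal form).
+- method="refine" : batch refinement — align N gate-passed solutions -> parameterize (generalize) + factor out reusable primitives + a thin task layer
+                    -> one better library skill. This implements "update adds generality + primitive reuse" (a single batched LLM call).
+"""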
+from __future__ import annotations
+import json
+import re
+from dataclasses import dataclass, field
+
+from .library import Library, Skill
+from .llm import llm
+
+
+@dataclass
+class Trace:
+    template: str
+    code: str                       # this task's final_script (already gate-passed = correct)
+    answer: object = None
+    meta: dict = field(default_factory=dict)   # params / site / start_url / output_schema ...
+    # usage: how this task used the library (the signal that drives update)
+    used_skill_id: str | None = None
+    verdict: str | None = None      # use | adapt | skip
+    correct: bool = True
+
+
+def _slug(template: str) -> str:
+    """Lowercase, turn each run outside [a-z0-9] into "_", trim "_", cap at 48 chars; "skill" if nothing is left."""
+    return re.sub(r"[^a-z0-9]+", "_", template.lower()).strip("_")[:48] or "skill"
+
+
+def _extract_code(txt: str) -> str:
+    """Return the text between the first ``` / ```python opening line and the LAST ```; otherwise txt unchanged."""
+    opening = re.search(r"```(?:python)?[ \t]*\n", txt)
+    closing = txt.rfind("```")
+    if opening is None or closing <= opening.end():
+        return txt
+    return txt[opening.end():closing]
+
+
+_REFINE_SYS = (  # system prompt used by _refine for every distillation call
+    "You are given N working Python solutions that EACH solve one concrete instance of the SAME web-task "
+    "template (they already passed a correctness gate). Distill them into ONE better library skill.\n"
+    "Do TWO things:\n"
+    "1) GENERALIZE: align the N solutions; the parts that are IDENTICAL across them are the reusable "
+    "skeleton; the parts that DIFFER are parameters. Expose the differing values as function "
+    "arguments / taskspec params — do NOT hardcode any instance's specific values. Make extraction "
+    "robust (paginate/until-done, self-verify against any declared total).\n"
+    "2) DECOMPOSE INTO REUSABLE PRIMITIVES: factor the expensive, reusable core into clearly-named "
+    "primitive functions (e.g. login(), open_report(period), extract_rows()), and keep a THIN task "
+    "layer on top that calls them. This lets future tasks reuse the primitives even if the final step "
+    "differs.\n"
+    "Interface (fixed): the skill reads taskspec.json from sys.argv[1] "
+    "(taskspec = {params, start_url, credentials, output_schema}) and writes agent_response.json with "
+    "retrieved_data MATCHING output_schema exactly. Output ONLY the python code in one ```python block."
+)
+
+_REFINE_INCREMENTAL = (  # suffix _refine appends to _REFINE_SYS when the library already has an entry
+    "\n\nINCREMENTAL MODE: a CURRENT library skill for this template already exists (shown below). "
+    "Do NOT rewrite it from scratch. START from the current skill and IMPROVE it using the NEW solutions: "
+    "keep its working primitives and structure, only widen/fix what the new solutions reveal (handle a "
+    "param value it missed, make an extraction more robust, fix a bug). Preserve everything that already "
+    "works. Output the full improved skill in one ```python block."
+)
+
+
+def _refine(traces: list[Trace], library: Library) -> list[str]:
+    """Batch-distill gate-passed solves of ONE template (taken from traces[0]) into a parameterized, primitive-based skill.
+    Incremental: when the library already holds code under the template's slug, that code is improved, not rewritten."""
+    if not traces:
+        return []
+    template = traces[0].template
+    schema = traces[0].meta.get("output_schema")
+    sid = _slug(template)
+    existing = library.get(sid)   # is there already an entry under this slug?
+    incremental = bool(existing and existing.code)   # the "CURRENT skill (shown below)" prompt needs code to show
+    blocks = [f"## Template\n{template}\n\n## Required output_schema for retrieved_data\n{json.dumps(schema)}\n"]
+    if incremental:
+        blocks.append(f"## CURRENT library skill (improve THIS, do not rewrite)\n```python\n{existing.code}\n```")
+    label = "NEW solutions" if incremental else "Solutions"
+    for i, tr in enumerate(traces):
+        blocks.append(
+            f"## {label} {i} (params={json.dumps(tr.meta.get('params'), ensure_ascii=False)}, "
+            f"answer={json.dumps(tr.answer, ensure_ascii=False)[:120]})\n```python\n{tr.code}\n```"
+        )
+    sys_prompt = _REFINE_SYS + (_REFINE_INCREMENTAL if incremental else "")
+    code = _extract_code(llm(sys_prompt, "\n\n".join(blocks), max_tokens=16000, timeout=400))
+    n_prev = (existing.meta.get("n_solves", 0) if existing else 0)
+    meta = {
+        "template": template,
+        "provenance": "update-refined-incremental" if incremental else "update-refined",
+        "site": traces[0].meta.get("site", ""),
+        "summary": f"Refined from {n_prev + len(traces)} gate-passed solves; parameterized + primitives.",
+        "signature": {"params": list((traces[0].meta.get("params") or {}).keys()),
+                      "call": "python skill.py taskspec.json"},
+        "output_schema": schema,
+        "n_solves": n_prev + len(traces),
+        "revisions": (existing.meta.get("revisions", 1) + 1) if existing else 1,
+    }
+    library.add(Skill(skill_id=sid, code=code, meta=meta))
+    return [sid]
+
+
+def _grow(traces: list[Trace], library: Library) -> list[str]:
+    """Promote each trace whose template has no skill yet to a new skill, verbatim; return the new skill_ids."""
+    known = {s.meta.get("template") for s in library.list()}
+    added = []
+    for tr in traces:
+        if tr.template and tr.template not in known:
+            sid = _slug(tr.template)
+            meta = {"template": tr.template, "provenance": "distilled", **tr.meta}
+            library.add(Skill(skill_id=sid, code=tr.code, meta=meta))
+            known.add(tr.template)
+            added.append(sid)
+    return added
+
+
+def evolve(traces: list[Trace], library: Library) -> dict:
+    """Unified update: on top of the EXISTING library, let each trace's usage (use/adapt/skip) decide the change.
+    This is the core of a sustainably growing library — v_n grows out of v_{n-1} instead of a rebuild from zero.
+
+    - USE   successes : the skill is good enough; the library is untouched (reuse evidence only).
+    - ADAPT successes : the core was reused and the tail fixed -> distill the fixed solves back into that
+                        template's skill (wider / more robust). This is how fixes settle into the library.
+    - SKIP / uncovered: the template has no skill yet -> add one distilled from this batch.
+
+    Only gate-passed traces (correct=True) are consumed, to prevent pollution. Returns a changelog dict.
+    """
+    good = [t for t in traces if t.correct]
+    changelog = {"use": [], "adapt_refined": [], "added": [], "dropped_wrong": len(traces) - len(good)}
+    # snapshot taken before the loop: template -> stored skill_id
+    skill_of = {s.meta.get("template"): s.skill_id for s in library.list()}
+
+    # group by template so that sibling solves are distilled together
+    by_tmpl: dict[str, list[Trace]] = {}
+    for t in good:
+        by_tmpl.setdefault(t.template, []).append(t)
+
+    for tmpl, group in by_tmpl.items():
+        verdicts = {t.verdict for t in group}
+        if tmpl not in skill_of:
+            # template not covered yet -> distill this batch into a new skill
+            bucket, sid = "added", _refine(group, library)[0]
+        elif "adapt" in verdicts:
+            # a fix happened -> re-distill into the skill (_refine reuses the slug, so it overwrites)
+            bucket, sid = "adapt_refined", _refine(group, library)[0]
+        else:
+            # no adapt in the group -> the stored skill is left untouched
+            bucket, sid = "use", skill_of[tmpl]
+        changelog[bucket].append(sid)
+    return changelog
+
+
+_UPDATERS = {"grow": _grow, "refine": _refine}  # method name -> updater implementation
+
+
+def update(traces, library: Library, *, method: str = "grow") -> list[str]:  # returns the skill_ids written
+    return _UPDATERS[method](traces, library)  # an unregistered method raises KeyError
+
+
+# ---------- CLI: batch update via a manifest ----------
+def traces_from_manifest(manifest: dict) -> list["Trace"]:
+    """Build one Trace per run. manifest = {"template": str, "runs": [{"dir","admit","params","answer"?,"verdict"?}, ...]}.
+    code = the run's final_script.py ("" if missing); answer falls back to agent_response.json["retrieved_data"]; correct = admit."""
+    from pathlib import Path
+    template = manifest.get("template", "")
+    out = []
+    for r in manifest.get("runs", []):
+        d = Path(r["dir"])
+        fs = d / "final_script.py"
+        code = fs.read_text(encoding="utf-8") if fs.exists() else ""
+        answer = r.get("answer")
+        if answer is None and (d / "agent_response.json").exists():
+            try:  # read_text() closes the file; json.load(open(...)) left the handle to the GC
+                answer = json.loads((d / "agent_response.json").read_text(encoding="utf-8")).get("retrieved_data")
+            except (OSError, ValueError, AttributeError):
+                pass  # best effort: unreadable / invalid / non-object response keeps answer = None
+        out.append(Trace(template=template, code=code, answer=answer,
+                         correct=bool(r.get("admit", True)),
+                         verdict=r.get("verdict", "skip"),
+                         used_skill_id=r.get("used_skill_id"),
+                         meta={"params": r.get("params", {}), "site": r.get("site", ""),
+                               "start_url": r.get("start_url", ""),
+                               "output_schema": r.get("output_schema")}))
+    return out
+
+
+def main(argv=None) -> int:
+    """CLI entry: load --manifest, evolve the library at --library, print the changelog as JSON; returns 0."""
+    import argparse
+    p = argparse.ArgumentParser(
+        prog="python -m webwright.skills.update",
+        description="Batch-update the skill library from a manifest of gate-judged solves.")
+    p.add_argument("--manifest", required=True, help="JSON: {template, runs:[{dir,admit,params,...}]}")
+    p.add_argument("--library", required=True, help="Path to the skill library directory.")
+    a = p.parse_args(argv)
+    with open(a.manifest, encoding="utf-8") as f:  # close the handle instead of leaking it
+        traces = traces_from_manifest(json.load(f))
+    print(json.dumps(evolve(traces, Library(a.library)), ensure_ascii=False, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/webwright/tools/skill_use.py b/src/webwright/tools/skill_use.py
new file mode 100644
index 0000000..9acc210
--- /dev/null
+++ b/src/webwright/tools/skill_use.py
@@ -0,0 +1,72 @@
+"""skill_use — solve-time tool: query the skill library for a reusable skill for THIS task.
+
+Like self_reflection / image_qa, the agent invokes this from bash during solving:
+
+    python -m webwright.tools.skill_use --task "Get the latest release version of facebook/react" \
+        --library "$WORKSPACE_DIR/../library"
+
+It retrieves the most relevant skill (relevance) and judges utility (use / adapt / skip), then
+prints a JSON recommendation telling the agent how to reuse it (and the path to read its source).
+The agent decides: reuse as-is (use), reuse the core and change only the last step (adapt), or
+solve from scratch (skip). Retrieval/judgement never block solving — on any error it prints skip.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+
+from webwright.skills.library import Library
+from webwright.skills.retrieve import retrieve
+from webwright.skills.decide import decide
+
+
+def recommend(task: str, library_root: str) -> dict:
+    """Retrieve candidates for `task`, judge them, and return {verdict, skill_id, reason} (+ reuse hints)."""
+    lib = Library(library_root)
+    cands = retrieve(task, lib)
+    if not cands:
+        return {"verdict": "skip", "skill_id": None, "reason": "library has no relevant skill"}
+    d = decide(task, cands)
+    out = {"verdict": d.verdict, "skill_id": d.skill_id, "reason": d.reason}
+    # reuse hints are attached only for a non-skip verdict whose skill can be loaded from the library
+    sk = lib.get(d.skill_id) if (d.verdict != "skip" and d.skill_id) else None
+    if sk:
+        out["summary"], out["call"] = sk.summary, sk.signature.get("call", "")
+        out["source_path"] = str(lib.path(sk.skill_id))
+        out["how_to_reuse"] = (
+            "USE: copy the source into your final_script and fill THIS task's params; "
+            "ADAPT: reuse its login/navigation/extraction core, change ONLY the final step."
+        )
+    return out
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser: --task (required), --library ($SKILL_LIBRARY_ROOT or "library"), --output (extra file copy)."""
+    parser = argparse.ArgumentParser(
+        prog="python -m webwright.tools.skill_use",
+        description="Query the skill library for a reusable skill for the current task.")
+    parser.add_argument("--task", required=True, help="The current task description / intent.")
+    parser.add_argument("--library", default=os.environ.get("SKILL_LIBRARY_ROOT", "library"),
+                        help="Path to the skill library dir (default: $SKILL_LIBRARY_ROOT or ./library).")
+    parser.add_argument("--output", default="", help="Also write the JSON to this path (it is still printed to stdout).")
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:  # CLI entry point; returns 0 once the JSON has been printed
+    args = build_parser().parse_args(argv)
+    try:
+        result = recommend(args.task, args.library)
+    except Exception as exc:  # never block solving
+        result = {"verdict": "skip", "skill_id": None, "reason": f"skill_use error: {exc}"}
+    payload = json.dumps(result, ensure_ascii=False, indent=2)
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:  # optional copy of the JSON on disk
+            f.write(payload)
+    print(payload)  # printed whether or not --output was given
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # the process exit status is main()'s return value
diff --git a/tests/skills/test_evolve.py b/tests/skills/test_evolve.py
new file mode 100644
index 0000000..ad4189c
--- /dev/null
+++ b/tests/skills/test_evolve.py
@@ -0,0 +1,49 @@
+"""Unit test: evolve (growing library, usage-driven). Stubs _refine to stay LLM-free."""
+import sys, tempfile
+from pathlib import Path
+pass
+import webwright.skills.update as U
+from webwright.skills.library import Library, Skill
+
+
+def run():
+    """Drive evolve through add -> use -> adapt -> dropped rounds with _refine stubbed out (no LLM)."""
+    def fake_refine(group, library):
+        # deterministic stand-in: "build/widen" a skill for the group's template
+        sid = U._slug(group[0].template)
+        library.add(Skill(sid, f"# refined from {len(group)} solves\n",
+                          {"template": group[0].template, "provenance": "test-refine"}))
+        return [sid]
+
+    real_refine, U._refine = U._refine, fake_refine
+    try:
+        with tempfile.TemporaryDirectory() as d:
+            lib = Library(d)
+            # round 1: template T1 not in lib, skip verdict -> ADD
+            t1 = [U.Trace("T1", "code", verdict="skip", correct=True) for _ in range(2)]
+            log1 = U.evolve(t1, lib)
+            assert log1["added"], f"new template should be added: {log1}"
+            assert len(lib.list()) == 1
+
+            # round 2: T1 now exists, all USE success -> library unchanged
+            t2 = [U.Trace("T1", "code", used_skill_id="t1", verdict="use", correct=True)]
+            log2 = U.evolve(t2, lib)
+            assert log2["use"] and not log2["added"] and not log2["adapt_refined"], log2
+            assert len(lib.list()) == 1, "pure use must not change library"
+
+            # round 3: T1 exists, an ADAPT happened -> refine back (widen)
+            t3 = [U.Trace("T1", "code2", used_skill_id="t1", verdict="adapt", correct=True)]
+            log3 = U.evolve(t3, lib)
+            assert log3["adapt_refined"], f"adapt should refine back: {log3}"
+
+            # wrong solves are dropped (not fed to refine)
+            t4 = [U.Trace("T2", "bad", verdict="skip", correct=False)]
+            log4 = U.evolve(t4, lib)
+            assert log4["dropped_wrong"] == 1 and not log4["added"], log4
+    finally:
+        U._refine = real_refine  # do not leak the stub into other tests running in this process
+    print("test_evolve OK")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/tests/skills/test_gate.py b/tests/skills/test_gate.py
new file mode 100644
index 0000000..6240b3b
--- /dev/null
+++ b/tests/skills/test_gate.py
@@ -0,0 +1,35 @@
+"""Unit test: admission gate (deterministic, no external)."""
+import sys
+from pathlib import Path
+pass
+from webwright.skills.gate import gate
+
+ARR = {"type": "array", "items": {"type": "string"}}
+
+
+def run():
+    """Exercise gate() across its self_verify, gold and auto paths (deterministic, no external calls)."""
+    # self_verify: null / empty results are rejected, a non-empty one is admitted
+    for empty in (None, [], ""):
+        assert gate(empty, method="self_verify").admit is False
+    assert gate(["Sprite"], method="self_verify").admit is True
+
+    # self_verify: the result's shape has to agree with output_schema
+    assert gate(["a", "b"], output_schema=ARR, method="self_verify").admit is True
+    assert gate({"x": 1}, output_schema=ARR, method="self_verify").admit is False, "dict != array schema"
+
+    # gold: admit iff equal. The explicit method and auto-with-gold are checked
+    # with the same two cases (match -> admit, mismatch -> reject).
+    for kw in ({"method": "gold"}, {}):
+        assert gate(["Sprite"], gold=["Sprite"], **kw).admit is True
+        assert gate(["Pepsi"], gold=["Sprite"], **kw).admit is False
+
+    # auto without gold falls back to self_verify
+    assert gate(["anything"]).admit is True      # non-empty -> pass
+    assert gate(None).admit is False             # null -> fail
+
+    print("test_gate OK")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/tests/skills/test_library.py b/tests/skills/test_library.py
new file mode 100644
index 0000000..174c6c3
--- /dev/null
+++ b/tests/skills/test_library.py
@@ -0,0 +1,33 @@
+"""Unit test: library store (deterministic, no LLM)."""
+import sys, tempfile
+from pathlib import Path
+pass
+from webwright.skills.library import Library, Skill
+
+
+def run():
+    with tempfile.TemporaryDirectory() as d:
+        lib = Library(d)
+        assert lib.list() == [], "empty library should list nothing"
+        assert lib.get("nope") is None, "missing skill -> None"
+
+        sk = Skill(skill_id="s1", code="print('hi')\n",
+                   meta={"template": "do {x}", "summary": "does x", "signature": {"params": ["x"]}})
+        lib.add(sk)
+
+        got = lib.get("s1")
+        assert got is not None and got.code == "print('hi')\n", "get returns added code"
+        assert got.meta["template"] == "do {x}"
+        assert got.summary == "does x"
+        assert got.signature["params"] == ["x"]
+        assert [s.skill_id for s in lib.list()] == ["s1"], "list shows added skill"
+        assert lib.path("s1").name == "skill.py" and lib.path("s1").exists()
+
+        # re-open from disk -> persisted
+        lib2 = Library(d)
+        assert [s.skill_id for s in lib2.list()] == ["s1"], "persisted across re-open"
+    print("test_library OK")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/tests/skills/test_retrieve_decide.py b/tests/skills/test_retrieve_decide.py
new file mode 100644
index 0000000..4bf032a
--- /dev/null
+++ b/tests/skills/test_retrieve_decide.py
@@ -0,0 +1,40 @@
+"""Unit test: deterministic parts of retrieve/decide (no LLM).
+The LLM paths are smoke-tested in test_front.py."""
+import sys, tempfile
+from pathlib import Path
+pass
+from webwright.skills.library import Library, Skill
+from webwright.skills.retrieve import retrieve, Candidate
+from webwright.skills.decide import decide, Decision
+
+
+def _lib(d):
+    lib = Library(d)
+    lib.add(Skill("bestsellers", "x", {"template": "Get the top best-selling product in period",
+                                        "summary": "magento bestsellers report"}))
+    lib.add(Skill("reviews", "x", {"template": "Get reviewers who mention something",
+                                   "summary": "product page reviews"}))
+    return lib
+
+
+def run():
+    with tempfile.TemporaryDirectory() as d:
+        lib = _lib(d)
+
+        # retrieve(method="simple"): keyword overlap, deterministic
+        cands = retrieve("top best-selling product", lib, method="simple")
+        assert cands, "simple retrieve should find the bestsellers skill"
+        assert cands[0].skill.skill_id == "bestsellers", "most-overlapping skill ranked first"
+
+        cands2 = retrieve("zzz nonsense quux", lib, method="simple")
+        assert cands2 == [], "no overlap -> no candidates"
+
+        # decide with no candidates -> skip (deterministic, no LLM)
+        d0 = decide("anything", [])
+        assert isinstance(d0, Decision) and d0.verdict == "skip" and d0.skill_id is None
+
+    print("test_retrieve_decide OK")
+
+
+if __name__ == "__main__":
+    run()
diff --git a/tests/skills/test_update.py b/tests/skills/test_update.py
new file mode 100644
index 0000000..2abfffe
--- /dev/null
+++ b/tests/skills/test_update.py
@@ -0,0 +1,36 @@
+"""Unit test: update 'grow' (deterministic, no LLM)."""
+import sys, tempfile
+from pathlib import Path
+pass
+from webwright.skills.library import Library
+from webwright.skills.update import update, Trace
+
+
+def run():
+    with tempfile.TemporaryDirectory() as d:
+        lib = Library(d)
+        assert lib.list() == []
+
+        traces = [
+            Trace(template="Get the top-{n} best-selling {entity} in {period}", code="print(1)",
+                  answer=["Sprite"], meta={"site": "shopping_admin"}),
+            Trace(template="Get the reviewers who mention {x}", code="print(2)", answer=["Bob"]),
+        ]
+        added = update(traces, lib, method="grow")
+        assert len(added) == 2, "two new templates -> two skills added"
+        assert len(lib.list()) == 2, "library grew to 2"
+
+        # idempotent: same templates again -> nothing new
+        added2 = update(traces, lib, method="grow")
+        assert added2 == [], "already-covered templates not re-added"
+        assert len(lib.list()) == 2, "library unchanged"
+
+        # the distilled skill carries its code + provenance
+        s = lib.get(added[0])
+        assert s.code == "print(1)" and s.meta["provenance"] == "distilled"
+
+    print("test_update OK")
+
+
+if __name__ == "__main__":
+    run()

From 91a2090a115a0ffb62f72ba63a94ff27852d4fe5 Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 06:12:30 +0000
Subject: [PATCH 2/6] skills: prompt hint helper + skill_mode overlay;
 env-based model for skill_use CLI

- skills/prompt.with_skill_hint: prepend skill-library usage hint to task prompt (non-invasive;
  webwright merges system_template by replacement, so prompt-level is the clean way)
- config/skill_mode.yaml: optional overlay doc + step budget for skill-reuse runs
- llm._model(): bare CLI (python -m webwright.tools.skill_use) builds model from
  SKILL_MODEL_NAME/ENDPOINT (or OPENAI_*) env -> same backend as agent, no hardcoded gateway

Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
---
 src/webwright/config/skill_mode.yaml | 13 +++++++++++++
 src/webwright/skills/__init__.py     |  3 ++-
 src/webwright/skills/prompt.py       | 26 ++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 src/webwright/config/skill_mode.yaml
 create mode 100644 src/webwright/skills/prompt.py

diff --git a/src/webwright/config/skill_mode.yaml b/src/webwright/config/skill_mode.yaml
new file mode 100644
index 0000000..b8d6158
--- /dev/null
+++ b/src/webwright/config/skill_mode.yaml
@@ -0,0 +1,13 @@
+# Skill-library mode (optional overlay): raise the step budget to leave headroom for skill-reuse runs.
+# Enable by stacking:  webwright run ... -c base.yaml -c model_openai.yaml -c skill_mode.yaml
+#
+# How the agent is told to reuse skills: the SKILL-LIBRARY block is prepended to the TASK prompt
+# by the caller (see webwright.skills.prompt.with_skill_hint), NOT injected via system_template —
+# webwright merges system_template by replacement, so a prompt-level hint is the clean, non-invasive
+# way and keeps webwright's default behavior unchanged when this mode is off.
+#
+# Requires env:
+#   SKILL_LIBRARY_ROOT          path to the skill library (read by the skill_use tool)
+#   SKILL_MODEL_NAME / SKILL_MODEL_ENDPOINT  (optional) backend for skill_use; defaults to OPENAI_*
+agent:
+  step_limit: 100
diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py
index 6f3383f..a4d119d 100644
--- a/src/webwright/skills/__init__.py
+++ b/src/webwright/skills/__init__.py
@@ -14,8 +14,9 @@
 from .gate import gate, GateResult
 from .update import evolve, Trace        # NOTE: don't import the `update` function here — it would
 from .llm import configure_llm           # shadow the `update` submodule. Use update.evolve / update.update.
+from .prompt import with_skill_hint
 
 __all__ = [
     "Library", "Skill", "retrieve", "Candidate", "decide", "Decision",
-    "gate", "GateResult", "evolve", "Trace", "configure_llm",
+    "gate", "GateResult", "evolve", "Trace", "configure_llm", "with_skill_hint",
 ]
diff --git a/src/webwright/skills/prompt.py b/src/webwright/skills/prompt.py
new file mode 100644
index 0000000..5497b1c
--- /dev/null
+++ b/src/webwright/skills/prompt.py
@@ -0,0 +1,26 @@
+"""Prompt helper: prepend a SKILL-LIBRARY hint to a task prompt so the agent reuses the library.
+
+Kept at the prompt level (not system_template) because webwright merges system_template by
+replacement; a task-prompt hint is non-invasive and leaves default behavior unchanged when unused.
+"""
+from __future__ import annotations
+
+_HINT = """## Skill library (reuse before solving from scratch)
+A library of previously-built executable code skills may contain one that helps this task.
+BEFORE planning from scratch, query it ONCE from bash:
+
+    python -m webwright.tools.skill_use --task "{task}" --library "{library}"
+
+It returns JSON {{verdict, skill_id, source_path, how_to_reuse}}:
+- "use"   : read source_path, copy it into your final_script, fill THIS task's params.
+- "adapt" : read source_path, reuse its login/navigation/extraction core, change ONLY the last step.
+- "skip"  : no useful skill — solve from scratch.
+Record your choice in skill_decision.json ({{skill_id, verdict, reason}}) before acting.
+
+---
+"""
+
+
+def with_skill_hint(task_prompt: str, *, task: str, library: str) -> str:
+    """Prepend the skill-library hint to a task prompt."""
+    return _HINT.format(task=task.replace('"', "'"), library=library) + task_prompt

From a96336b44875ebeebc9e7403e1b5f4778762ae22 Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:06:18 +0000
Subject: [PATCH 3/6] skills: README for the module + env-based model for bare
 CLI

- README: what the module is, the two plug points (skill_use tool + update CLI),
  components table, gate semantics, backend config, results summary
- llm._model(): bare CLI builds model from SKILL_MODEL_NAME/ENDPOINT (or OPENAI_*) env

Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
---
 src/webwright/skills/README.md | 76 ++++++++++++++++++++++++++++++++++
 src/webwright/skills/llm.py    | 14 ++++++-
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 src/webwright/skills/README.md

diff --git a/src/webwright/skills/README.md b/src/webwright/skills/README.md
new file mode 100644
index 0000000..6ca27a7
--- /dev/null
+++ b/src/webwright/skills/README.md
@@ -0,0 +1,76 @@
+# `webwright.skills` — a memory / skill-library module for Webwright
+
+Turn solved tasks into **reusable, executable code skills**, retrieve and judge them at solve
+time, gate what enters the library, and grow the library incrementally. A self-evolving loop:
+
+```
+solve  --(gate: gold | self_verify)-->  admit  --(evolve: refine + parameterize + primitives)-->  library
+  ^                                                                                                   |
+  |  skill_use tool: retrieve + decide (use / adapt / skip)  <--------------------------------------- +
+```
+
+This is the missing **reuse + accumulation** layer: Webwright already turns a task into a
+parameterized script (`crafted_cli`); this module accumulates those across tasks, judges when a
+prior skill applies, and improves skills as more solves arrive — with a gate so wrong solves don't
+pollute the library.
+
+## How it plugs into Webwright
+
+Two touch points, **no change to the agent loop or default config**:
+
+1. **Reuse at solve time — the `skill_use` tool.** The agent invokes it from bash, exactly like
+   `self_reflection` / `image_qa`:
+   ```bash
+   python -m webwright.tools.skill_use --task "<the task>" --library "$SKILL_LIBRARY_ROOT"
+   ```
+   It returns JSON `{verdict: use|adapt|skip, skill_id, source_path, how_to_reuse}`. The agent
+   reads `source_path` and reuses the skill (use = as-is, adapt = reuse core + change last step,
+   skip = solve from scratch). `webwright.skills.with_skill_hint(prompt, ...)` prepends a short
+   usage hint to the task prompt so the agent remembers to query the library first.
+
+2. **Growth after solving — the `update` CLI.** Distill a batch of gate-passed solves into a
+   library skill (offline, not in the solve loop):
+   ```bash
+   python -m webwright.skills.update --manifest batch.json --library ./library
+   ```
+   `batch.json = {"template": "...", "runs": [{"dir","admit","params", ...}, ...]}`.
+
+## Components
+
+| file | role |
+|---|---|
+| `library.py`  | `Skill` + `Library(root)`: on-disk skills (`<id>/skill.py` + `meta.json`) |
+| `retrieve.py` | `retrieve(task, library)` → ranked `Candidate`s (relevance) |
+| `decide.py`   | `decide(task, candidates)` → `Decision(verdict, skill_id, reason)` (utility: use/adapt/skip) |
+| `gate.py`     | `gate(result, method=gold\|self_verify\|none)` → admit? (keeps wrong solves out) |
+| `update.py`   | `evolve(traces, library)`: grow on the existing library — add / adapt-refine / keep; `_refine` parameterizes + decomposes into primitives, incrementally improving an existing skill |
+| `llm.py`      | `configure_llm(model)` + `llm()`: **backend-agnostic** via Webwright's `Model` abstraction; a bare CLI builds the model from `SKILL_MODEL_NAME`/`SKILL_MODEL_ENDPOINT` (or `OPENAI_*`) env — no hardcoded endpoint/key |
+| `prompt.py`   | `with_skill_hint(prompt, task, library)`: non-invasive task-prompt hint |
+
+## Gate
+
+`gate(method=...)` is the **admission** check (independent of the solving agent — not the same as
+`self_reflection`, which is the agent's own completion condition):
+- `gold` — compare against a known answer (benchmarks); strongest.
+- `self_verify` — invariant only (non-empty + shape); weak placeholder when no gold exists. It does
+  not check *correctness*. Note: `self_reflection` cannot serve as the gate — `require_self_reflection_success`
+  makes it always `predicted_label==1`, so it would admit everything (agent grading itself).
+- `none` — admit all (demos).
+
+## Backend
+
+Backend-agnostic. Either `configure_llm(model_config_or_Model)` once in-process, or set
+`SKILL_MODEL_NAME` / `SKILL_MODEL_ENDPOINT` (falling back to `OPENAI_*`) so a bare tool invocation
+uses the same backend as the running agent. No gateway or key is hardcoded.
+
+## Results (summary)
+
+Validated with this module (full data + analysis live in the companion research repo, not here):
+- **Real website (public GitHub, read-only):** end-to-end loop works — two repos solved from
+  scratch → `update` distilled a parameterized skill → a held-out repo solved by reusing it
+  (agent called `skill_use`, verdict `use`, answer correct).
+- **Incremental growth:** a second batch improves the existing skill in place (keeps the working
+  functions, adds robustness) rather than rewriting it.
+- **Gate prevents pollution:** a wrong solve is dropped and never enters the library.
+- **Reuse value is task-dependent:** step savings are modest on easy tasks (query overhead ≈ the
+  exploration it saves) and larger on harder tasks with more exploration to skip.
diff --git a/src/webwright/skills/llm.py b/src/webwright/skills/llm.py
index 2e0703e..278694a 100644
--- a/src/webwright/skills/llm.py
+++ b/src/webwright/skills/llm.py
@@ -25,8 +25,18 @@ def configure_llm(model: Any) -> None:
 def _model() -> Any:
     if _DEFAULT_MODEL is not None:
         return _DEFAULT_MODEL
-    # default: openai model from env (OPENAI_API_KEY / OPENAI_BASE_URL respected by the model class)
-    return get_model({"model_class": "openai"})
+    # default: build an openai-style model from env so a bare CLI invocation
+    # (e.g. `python -m webwright.tools.skill_use`) uses the SAME backend as the running agent.
+    # Honors SKILL_MODEL_NAME / SKILL_MODEL_ENDPOINT (or OPENAI_* fallbacks); no hardcoded gateway.
+    import os
+    cfg = {"model_class": os.environ.get("SKILL_MODEL_CLASS", "openai")}
+    name = os.environ.get("SKILL_MODEL_NAME") or os.environ.get("OPENAI_MODEL")
+    endpoint = os.environ.get("SKILL_MODEL_ENDPOINT") or os.environ.get("OPENAI_ENDPOINT")
+    if name:
+        cfg["model_name"] = name
+    if endpoint:
+        cfg["openai_endpoint"] = endpoint
+    return get_model(cfg)
 
 
 def llm(system: str, user: str, *, model: Any = None, **_: Any) -> str:

From 82fe0ba5a40437d23ce691a1710abb2e345bc7ce Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:06:52 +0000
Subject: [PATCH 4/6] docs+tests: README skill-library section + skills unit
 tests

- README: Skill Library section (what it is, reuse via skill_use tool, grow via update CLI,
  end-to-end validation summary)
- tests/skills: 5 unit tests for library/gate/update/evolve/retrieve+decide

Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index 891bbd7..042c61f 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,25 @@ python assets/task_showcase/app.py \
 
 ---
 
+## 🧠 Skill Library (reuse solved tasks across tasks)
+
+[`webwright.skills`](src/webwright/skills/) turns solved tasks into **reusable, executable code
+skills**, retrieves and judges them at solve time, gates what enters the library, and grows the
+library incrementally — a self-evolving *store → retrieve → use/adapt → gate → evolve* loop on top
+of Webwright's code-as-action solves. Plugs in with **no change to the agent loop**:
+
+- **Reuse** — the agent calls `python -m webwright.tools.skill_use --task "..." --library ...`
+  (like `self_reflection`/`image_qa`); it returns `{verdict: use|adapt|skip, source_path}`.
+- **Grow** — `python -m webwright.skills.update --manifest batch.json --library ./library`
+  distills a batch of gate-passed solves into a parameterized, primitive-decomposed skill.
+
+Validated end-to-end on a real public website (read-only GitHub): solve two repos from scratch →
+`update` builds a parameterized skill → a held-out repo is solved by reusing it (agent calls
+`skill_use`, verdict `use`, answer correct); a wrong solve is kept out by the gate; a second batch
+improves the existing skill in place. See [`src/webwright/skills/README.md`](src/webwright/skills/README.md).
+
+---
+
 ## 🚀 Quick Start
 
 ### Prerequisites

From 05aab3a081eef593da77bbaa179f7e7038de06d9 Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:18:12 +0000
Subject: [PATCH 5/6] skills: translate all comments/docstrings to English

Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
---
 src/webwright/skills/decide.py   |  6 ++--
 src/webwright/skills/gate.py     | 28 +++++++++++-------
 src/webwright/skills/retrieve.py |  8 +++--
 src/webwright/skills/update.py   | 51 ++++++++++++++++++--------------
 4 files changed, 53 insertions(+), 40 deletions(-)

diff --git a/src/webwright/skills/decide.py b/src/webwright/skills/decide.py
index eaddc8d..a966adb 100644
--- a/src/webwright/skills/decide.py
+++ b/src/webwright/skills/decide.py
@@ -1,8 +1,8 @@
-"""判断用不用：候选 + 任务 → use / adapt / skip（utility）。
+"""Decide whether to use: candidates + task -> use / adapt / skip (utility).
 
-接口稳定（实现可换）：
+Stable interface (swappable implementation):
     decide(task, candidates, *, method="llm") -> Decision
-相关 ≠ 有用：retrieve 给"像不像"，decide 给"该不该用、怎么用"。
+Relevant != useful: retrieve gives "how similar", decide gives "whether and how to use it".
 """
 from __future__ import annotations
 from dataclasses import dataclass
diff --git a/src/webwright/skills/gate.py b/src/webwright/skills/gate.py
index 4ebc43e..0e9baac 100644
--- a/src/webwright/skills/gate.py
+++ b/src/webwright/skills/gate.py
@@ -1,17 +1,23 @@
-"""准入闸：只有"对的"解/技能才准进库，防 correct-but-narrow / regression 污染。
-gate 是【独立第二只眼】，与解题 agent 自己的 self_reflection 不同（后者是解题完成条件）。
+"""Admission gate: only "correct" solves/skills enter the library, preventing correct-but-narrow /
+regression pollution. The gate is an INDEPENDENT second eye — distinct from the solving agent's own
+self_reflection (which is a solve-completion condition, not an admission check).
 
-接口稳定（实现可换），method 可配置：
+Stable interface (swappable implementation), configurable method:
     gate(result, *, gold=None, output_schema=None, method="auto") -> GateResult
 
-- method="gold"        : 与 gold 比对（WebArena 等有标准答案；真独立、能挡住抽错的解）。★推荐
-- method="self_verify" : 不变量（result 非空 + shape 合 output_schema）。无 gold 时的弱占位。
-                         ⚠️ 局限：只查"有没有/形状对不对"，不查"对不对"——抽错但非空的答案照样放行。
-                         （注：webwright 的 self_reflection 因 require_self_reflection_success 而恒为
-                         predicted_label==1，故不能用它当 gate；那是解题完成条件，非独立准入。）
-- method="none"        : 不把关（纯演示复用，不防污染）。
-- method="auto"        : 有 gold 用 gold，否则 self_verify。
-升级路径（next step）：真实站用 WebJudge（OM2W 官方 judge）或跨源一致核验，做真独立把关。
+- method="gold"        : compare against gold (benchmarks like WebArena; truly independent, catches
+                         mis-extracted solves). Recommended.
+- method="self_verify" : invariant only (result non-empty + shape matches output_schema). A weak
+                         placeholder when no gold exists.
+                         Limitation: only checks "present / right shape", not "correct" — a wrong but
+                         non-empty answer is admitted anyway.
+                         (Note: webwright's self_reflection is always predicted_label==1 due to
+                         require_self_reflection_success, so it cannot serve as the gate — that is a
+                         solve-completion condition, not independent admission.)
+- method="none"        : no gate (demo reuse only, no pollution protection).
+- method="auto"        : use gold if available, else self_verify.
+Upgrade path (next step): for real websites use WebJudge (OM2W's official judge) or cross-source
+consistency checks for a truly independent gate.
 """
 from __future__ import annotations
 from dataclasses import dataclass
diff --git a/src/webwright/skills/retrieve.py b/src/webwright/skills/retrieve.py
index fae5d1e..5f7a405 100644
--- a/src/webwright/skills/retrieve.py
+++ b/src/webwright/skills/retrieve.py
@@ -1,8 +1,9 @@
-"""取：任务 → 最相关的候选技能（relevance）。
+"""Retrieve: task -> most relevant candidate skills (relevance only).
 
-接口稳定（实现可换）：
+Stable interface (swappable implementation):
     retrieve(task, library, *, k=3, method="llm") -> [Candidate]
-MVP: 单次 LLM 调用，把整库当 flat catalog 列进 prompt 让它选。库大了换 embedding，接口不变。
+MVP: a single LLM call that lists the whole library as a flat catalog in the prompt and lets it
+pick. Swap to embeddings when the library grows large — the interface stays the same.
 """
 from __future__ import annotations
 from dataclasses import dataclass
@@ -15,6 +16,7 @@
 class Candidate:
     skill: Skill
     score: float          # relevance 0..1
+
     reason: str
 
 
diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py
index bcb4a80..61101fa 100644
--- a/src/webwright/skills/update.py
+++ b/src/webwright/skills/update.py
@@ -1,11 +1,13 @@
-"""沉淀（间歇）：把通过 gate 的解蒸馏、写回 library，让库从使用中长大。
+"""Consolidate (periodic): distill gate-passed solves back into the library so it grows from use.
 
-接口稳定（实现可换）：
-    update(traces, library, *, method="grow") -> [被加/更新的 skill_id]
+Stable interface (swappable implementation):
+    update(traces, library, *, method="grow") -> [added/updated skill_ids]
 
-- method="grow"   : 库里还没覆盖这个 template 的，就把这条成功解原样提升为技能（最小形态）。
-- method="refine" : 批量提炼——对齐 N 个 gate 过的解 → 参数化(泛化) + 拆出可复用 primitive + 薄任务层
-                    → 一个更好的库技能。这是"update 加泛化性 + primitive 复用性"的实现（单次 LLM 批量调用）。
+- method="grow"   : if the library does not yet cover this template, promote the successful solve
+                    as-is into a skill (minimal form).
+- method="refine" : batch distillation — align N gate-passed solves -> parameterize (generalize) +
+                    factor out reusable primitives + a thin task layer -> one better library skill.
+                    This is where update adds generalization + primitive reusability (one batched LLM call).
 """
 from __future__ import annotations
 import json
@@ -19,10 +21,10 @@
 @dataclass
 class Trace:
     template: str
-    code: str                       # 这条任务的 final_script（已过 gate = 正确）
+    code: str                       # this task's final_script (already gate-passed = correct)
     answer: object = None
     meta: dict = field(default_factory=dict)   # params / site / start_url / output_schema ...
-    # usage：这条任务是怎么用库的（驱动 update 的信号）
+    # usage: how this task used the library (the signal that drives update)
     used_skill_id: str | None = None
     verdict: str | None = None      # use | adapt | skip
     correct: bool = True
@@ -69,14 +71,15 @@ def _extract_code(txt: str) -> str:
 
 
 def _refine(traces: list[Trace], library: Library) -> list[str]:
-    """批量提炼：对齐 N 个 gate 过的解 → 参数化 + primitive。
-    增量：若库里已有同 template 技能，则在【现有技能基础上】改进/加宽（而非从原始解重写）。"""
+    """Batch distillation: align N gate-passed solves -> parameterize + primitives.
+    Incremental: if a skill for the same template already exists, improve/widen it on top of the
+    existing skill (rather than rewriting from the raw solves)."""
     if not traces:
         return []
     template = traces[0].template
     schema = traces[0].meta.get("output_schema")
     sid = _slug(template)
-    existing = library.get(sid)   # 已有技能？→ 增量演化
+    existing = library.get(sid)   # skill already exists? -> incremental evolution
 
     blocks = [f"## Template\n{template}\n\n## Required output_schema for retrieved_data\n{json.dumps(schema)}\n"]
     if existing and existing.code:
@@ -120,21 +123,23 @@ def _grow(traces: list[Trace], library: Library) -> list[str]:
 
 
 def evolve(traces: list[Trace], library: Library) -> dict:
-    """统一 update：在【已有库】上，按每条轨迹的 usage(use/adapt/skip)决定怎么改库。
-    这是"可持续增长的库"的核心——不是每次从零建，而是在 v_{n-1} 上长出 v_n。
+    """Unified update: evolve the EXISTING library, deciding per trace's usage (use/adapt/skip) how
+    to change it. This is the core of a continuously-growing library — not rebuilt from scratch each
+    time, but grown from v_{n-1} into v_n.
 
-    - USE   成功的轨迹：技能够好，不动库（只是复用证据）。
-    - ADAPT 成功的轨迹：复用了核心、fix 了末端 → 把这批 fix 后的解【提炼回该 template 的技能】
-                        （加宽/更稳）。这就是"fix 沉淀进库"。
-    - SKIP / 库没覆盖：该 template 还没有技能 → 用这批解【新增】一个技能。
+    - USE   (successful)        : the skill is good enough, leave it untouched (just reuse evidence).
+    - ADAPT (successful)        : core reused, last step fixed -> refine this batch's fixed solves
+                                  back into the template's skill (widen/harden). This is how a fix
+                                  gets folded into the library.
+    - SKIP / not yet covered    : the template has no skill yet -> add one from this batch.
 
-    只吃 gate 过(correct=True)的轨迹（防污染）。返回一份 changelog。
+    Only consumes gate-passed (correct=True) traces (pollution protection). Returns a changelog.
     """
     good = [t for t in traces if t.correct]
     changelog = {"use": [], "adapt_refined": [], "added": [], "dropped_wrong": len(traces) - len(good)}
     existing_templates = {s.meta.get("template"): s.skill_id for s in library.list()}
 
-    # 按 template 分组（同族解一起提炼/沉淀）
+    # group by template (same-family solves are distilled/consolidated together)
     by_tmpl: dict[str, list[Trace]] = {}
     for t in good:
         by_tmpl.setdefault(t.template, []).append(t)
@@ -142,15 +147,15 @@ def evolve(traces: list[Trace], library: Library) -> dict:
     for tmpl, group in by_tmpl.items():
         verdicts = {t.verdict for t in group}
         if tmpl not in existing_templates:
-            # 库没覆盖 → 新增（用这批解提炼出一个技能）
+            # not covered -> add (distill a skill from this batch)
             sid = _refine(group, library)[0]
             changelog["added"].append(sid)
         elif "adapt" in verdicts:
-            # 有 fix 发生 → 把 fix 后的解重新提炼回该技能（加宽/更稳）
-            sid = _refine(group, library)[0]   # _refine 用同 slug，覆盖加宽
+            # a fix happened -> refine the fixed solves back into the skill (widen/harden)
+            sid = _refine(group, library)[0]   # _refine uses the same slug, overwrites + widens
             changelog["adapt_refined"].append(sid)
         else:
-            # 全 use 成功 → 技能够好，不动
+            # all use-success -> skill is good enough, leave it
             changelog["use"].append(existing_templates[tmpl])
     return changelog
 

From bb0d0cde7b5f346582f0590b0bcb9b96051fa2e5 Mon Sep 17 00:00:00 2001
From: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:22:07 +0000
Subject: [PATCH 6/6] skills: drop dead code in update (grow path superseded by
 evolve)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove _grow / update() / _UPDATERS dispatch — evolve() is the single entry now; drop the
test_update test that exercised the removed grow path. Keep retrieve/llm fallbacks (useful).

Co-Authored-By: Demi Wang <86202027+DEM1TASSE@users.noreply.github.com>
---
 src/webwright/skills/__init__.py |  4 ++--
 src/webwright/skills/update.py   | 21 -------------------
 tests/skills/test_update.py      | 36 --------------------------------
 3 files changed, 2 insertions(+), 59 deletions(-)
 delete mode 100644 tests/skills/test_update.py

diff --git a/src/webwright/skills/__init__.py b/src/webwright/skills/__init__.py
index a4d119d..79a0bf3 100644
--- a/src/webwright/skills/__init__.py
+++ b/src/webwright/skills/__init__.py
@@ -12,8 +12,8 @@
 from .retrieve import retrieve, Candidate
 from .decide import decide, Decision
 from .gate import gate, GateResult
-from .update import evolve, Trace        # NOTE: don't import the `update` function here — it would
-from .llm import configure_llm           # shadow the `update` submodule. Use update.evolve / update.update.
+from .update import evolve, Trace
+from .llm import configure_llm
 from .prompt import with_skill_hint
 
 __all__ = [
diff --git a/src/webwright/skills/update.py b/src/webwright/skills/update.py
index 61101fa..e451859 100644
--- a/src/webwright/skills/update.py
+++ b/src/webwright/skills/update.py
@@ -108,20 +108,6 @@ def _refine(traces: list[Trace], library: Library) -> list[str]:
     return [sid]
 
 
-def _grow(traces: list[Trace], library: Library) -> list[str]:
-    existing = {s.meta.get("template") for s in library.list()}
-    added = []
-    for tr in traces:
-        if not tr.template or tr.template in existing:
-            continue
-        sid = _slug(tr.template)
-        meta = {"template": tr.template, "provenance": "distilled", **tr.meta}
-        library.add(Skill(skill_id=sid, code=tr.code, meta=meta))
-        existing.add(tr.template)
-        added.append(sid)
-    return added
-
-
 def evolve(traces: list[Trace], library: Library) -> dict:
     """Unified update: evolve the EXISTING library, deciding per trace's usage (use/adapt/skip) how
     to change it. This is the core of a continuously-growing library — not rebuilt from scratch each
@@ -160,13 +146,6 @@ def evolve(traces: list[Trace], library: Library) -> dict:
     return changelog
 
 
-_UPDATERS = {"grow": _grow, "refine": _refine}
-
-
-def update(traces, library: Library, *, method: str = "grow") -> list[str]:
-    return _UPDATERS[method](traces, library)
-
-
 # ---------- CLI: batch update via a manifest ----------
 def traces_from_manifest(manifest: dict) -> list["Trace"]:
     """manifest = {"template": str, "runs": [{"dir","admit","params","answer"?,"verdict"?}, ...]}.
diff --git a/tests/skills/test_update.py b/tests/skills/test_update.py
deleted file mode 100644
index 2abfffe..0000000
--- a/tests/skills/test_update.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""Unit test: update 'grow' (deterministic, no LLM)."""
-import sys, tempfile
-from pathlib import Path
-pass
-from webwright.skills.library import Library
-from webwright.skills.update import update, Trace
-
-
-def run():
-    with tempfile.TemporaryDirectory() as d:
-        lib = Library(d)
-        assert lib.list() == []
-
-        traces = [
-            Trace(template="Get the top-{n} best-selling {entity} in {period}", code="print(1)",
-                  answer=["Sprite"], meta={"site": "shopping_admin"}),
-            Trace(template="Get the reviewers who mention {x}", code="print(2)", answer=["Bob"]),
-        ]
-        added = update(traces, lib, method="grow")
-        assert len(added) == 2, "two new templates -> two skills added"
-        assert len(lib.list()) == 2, "library grew to 2"
-
-        # idempotent: same templates again -> nothing new
-        added2 = update(traces, lib, method="grow")
-        assert added2 == [], "already-covered templates not re-added"
-        assert len(lib.list()) == 2, "library unchanged"
-
-        # the distilled skill carries its code + provenance
-        s = lib.get(added[0])
-        assert s.code == "print(1)" and s.meta["provenance"] == "distilled"
-
-    print("test_update OK")
-
-
-if __name__ == "__main__":
-    run()