diff --git a/README_en.md b/README_en.md index b409b99..f5f54fb 100644 --- a/README_en.md +++ b/README_en.md @@ -83,12 +83,75 @@ vision tool calls the vision API → returns image description ## Environment Variables -| Variable | Description | Example | -| ----------------- | ------------------------------------------------------------------ | ------------------------------- | -| `VISION_API_KEY` | Vision API key | `sk-your-api-key` | -| `VISION_API_URL` | Vision API base URL | `https://your-api-endpoint/v1` | -| `VISION_MODEL` | Vision model name
(not needed for MiniMax) | `your-vision-model` | -| `VISION_API_TYPE` | Optional, force API type
`openai` / `minimax` | `minimax` | +| Variable | Description | Example | +| ----------------------- | ------------------------------------------------------------------ | ------------------------------- | +| `VISION_MODE` | Delegation mode: `api` or `subagent` (case-insensitive). If unset or set to any other value, auto-detects — uses `api` when `VISION_API_KEY` is set, otherwise falls back to `subagent`. | `subagent` | +| `VISION_API_KEY` | Vision API key (required for `api` mode) | `sk-your-api-key` | +| `VISION_API_URL` | Vision API base URL (required for `api` mode) | `https://your-api-endpoint/v1` | +| `VISION_MODEL` | Vision model name (required for OpenAI-compatible backends; not needed for MiniMax) | `your-vision-model` | +| `VISION_API_TYPE` | Optional, force API type `openai` / `minimax` | `minimax` | +| `VISION_SUBAGENT_NAME` | Subagent identifier for `subagent` mode (default: `image-reader`) | `image-reader` | +| `VISION_MAX_TOKENS` | Vision API max response tokens (default: 4096) | `4096` | +| `VISION_FETCH_TIMEOUT_MS` | Fetch timeout in ms (default: 60000) | `60000` | +| `VISION_MAX_IMAGES` | LRU image cache cap (default: 200) | `200` | + +### Delegation Modes + +The plugin supports two modes for obtaining image descriptions when the active model lacks vision: + +#### `api` mode (original behaviour; default when `VISION_API_KEY` is set) + +The `vision` tool calls an external VLM API directly. Requires `VISION_API_KEY` and `VISION_API_URL`. Supports OpenAI-compatible backends and MiniMax VLM. + +```bash +export VISION_MODE=api +export VISION_API_KEY="sk-your-api-key" +export VISION_API_URL="https://your-api-endpoint/v1" +export VISION_MODEL="your-vision-model" +``` + +#### `subagent` mode (new) + +The plugin instructs the LLM to delegate image analysis to a vision-capable subagent via the Task tool. No external API key required — the subagent runs on whatever multimodal model is configured in opencode (e.g. `opencode-go/minimax-m3`). 
+ +```bash +export VISION_MODE=subagent +# Optional: override the subagent name (default: image-reader) +# export VISION_SUBAGENT_NAME=image-reader +``` + +**Setup for subagent mode:** + +1. Create a subagent definition at `~/.config/opencode/agent/image-reader.md`: + +```markdown +--- +description: Analyzes images and screenshots using a multimodal model. Use when the main agent cannot view images. +mode: subagent +model: opencode-go/minimax-m3 +permission: + read: allow + glob: allow + list: allow + bash: deny + edit: deny +--- + +You are a vision analyst. Read the image at the given path using the `read` tool and describe what you see. +``` + +2. Restart opencode. The plugin will automatically: + - Save pasted images to `/tmp/opencode-vision/image{N}/` + - Inject a system prompt instructing the non-vision model to delegate + - Inject a path hint naming the subagent + +#### Auto-fallback + +When `VISION_MODE` is unset (or is neither `api` nor `subagent`), the plugin uses: +- `api` mode if `VISION_API_KEY` is present +- `subagent` mode otherwise + +This means the plugin works out-of-the-box without any external credentials, as long as a vision-capable subagent is configured. > `VISION_API_URL`: OpenAI-compatible backends auto-append `/chat/completions`; MiniMax auto-detects and uses `/v1/coding_plan/vlm`. 
> diff --git a/plugins/vision-helper.ts b/plugins/vision-helper.ts index 402091d..3447603 100644 --- a/plugins/vision-helper.ts +++ b/plugins/vision-helper.ts @@ -42,11 +42,38 @@ function isPluginInjectedText(text: string): boolean { if (!text) return false return ( /^\[Image #\d+ auto-saved to /.test(text) || + /^\[Image #\d+ \w+\.\w+ auto-saved to /.test(text) || /^\[Images auto-saved to:/.test(text) || + /^\[Images \(\d+\) auto-saved to:/.test(text) || /^\[vision: image\d+\/[\w-]+\.[\w]+]$/.test(text) ) } +// ── Delegation mode ── +// +// VISION_MODE controls how non-vision models obtain image descriptions: +// "api" (original; default when VISION_API_KEY is set) — the vision tool calls an external VLM +// API (VISION_API_KEY + VISION_API_URL). This is the upstream +// behaviour and is left untouched. +// "subagent" — the plugin instructs the LLM to delegate image analysis to +// the configured subagent (default: @image-reader) via the Task tool. No external API +// key is required; the subagent runs on whatever vision-capable +// model is configured in opencode (e.g. opencode-go/minimax-m3). +// +// Auto-fallback: when VISION_MODE is unset or unrecognised, the plugin uses "api" mode if +// VISION_API_KEY is present, otherwise it falls back to "subagent" mode so +// the plugin works out-of-the-box without any external credentials. +// +// VISION_SUBAGENT_NAME overrides the subagent identifier injected into the +// system prompt and path hint (default: "image-reader"). +const VISION_MODE_RAW = (process.env["VISION_MODE"] || "").toLowerCase() +const VISION_SUBAGENT_NAME = process.env["VISION_SUBAGENT_NAME"] || "image-reader" +const hasApiKey = !!process.env["VISION_API_KEY"] +const VISION_MODE: "api" | "subagent" = + VISION_MODE_RAW === "subagent" ? "subagent" : + VISION_MODE_RAW === "api" ? "api" : + hasApiKey ? "api" : "subagent" + /** * Hook runs just before messages are sent to the model. 
For every user message * with attached images: @@ -100,6 +127,9 @@ export default (async () => { "**Native-vision models should NEVER call this tool — use the built-in `read` tool instead, which returns the actual image attachment directly. This tool exists for text-only models that cannot parse image bytes returned by `read`.**", ].join("\n") } + // In subagent mode, the vision tool is typically absent (no VISION_API_KEY). + // OpenCode silently ignores tool.definition calls for tools that don't + // exist, so this hook is a no-op in that case — no extra guard needed. }, "experimental.chat.system.transform": async (input, output) => { const model = input.model as unknown as { @@ -112,6 +142,10 @@ export default (async () => { output.system.push( "You have native image input capabilities. You can directly view and analyze images attached to user messages. Do NOT call the `vision` tool to read images sent by the user — analyze them natively instead.", ) + } else if (VISION_MODE === "subagent") { + output.system.push( + `IMPORTANT: This model does NOT support image input. When a user attaches an image or screenshot, OpenCode will save it to a temp directory and inject a path hint like '[Image #N auto-saved to /tmp/opencode-vision/imageN/hash.png]'. You MUST delegate image analysis to the @${VISION_SUBAGENT_NAME} subagent via the Task tool (subagent_type="${VISION_SUBAGENT_NAME}", prompt="Read and describe the image at <path>"). The @${VISION_SUBAGENT_NAME} subagent runs on a multimodal model (e.g. opencode-go/minimax-m3) that can read images. Never attempt to read images directly with the \`read\` tool — it will fail with 'model does not support image input'.`, + ) } }, "experimental.chat.messages.transform": async (_input, output) => { @@ -177,15 +211,18 @@ export default (async () => { if (saved.length === 0) continue - // Build path hint(s). 
Intentionally does NOT guide the LLM to the - // vision tool — the tool.definition hook above steers native-vision - // models toward the built-in read tool, and the vision tool's own - // description recommends read for native-vision models. The hint - // just records where the temp copy is, so any model that needs it - // can find it. + // Build path hint(s). The hint records where the temp copy lives so + // any model that needs it can find it. In "api" mode the hint is + // neutral (the vision tool's own description handles routing); in + // "subagent" mode the hint explicitly names the configured subagent + // (VISION_SUBAGENT_NAME) so the LLM knows to delegate via the Task tool. + const hintSuffix = VISION_MODE === "subagent" + ? ` — use @${VISION_SUBAGENT_NAME} subagent via Task tool to analyze it if you cannot view images natively]` + : `]` + const hints = saved.length === 1 - ? `[Image #${saved[0].seq} auto-saved to ${path.join(TMP_DIR, `image${saved[0].seq}`, saved[0].name)}]` - : `[Images auto-saved to:\n${saved.map((s) => ` ${path.join(TMP_DIR, `image${s.seq}`, s.name)}`).join("\n")}]` + ? `[Image #${saved[0].seq} ${saved[0].name} auto-saved to ${path.join(TMP_DIR, `image${saved[0].seq}`, saved[0].name)}${hintSuffix}` + : `[Images (${saved.length}) auto-saved to:\n${saved.map((s) => ` ${path.join(TMP_DIR, `image${s.seq}`, s.name)}`).join("\n")}${hintSuffix}` msg.parts.push({ type: "text",