diff --git a/README_en.md b/README_en.md
index b409b99..f5f54fb 100644
--- a/README_en.md
+++ b/README_en.md
@@ -83,12 +83,75 @@ vision tool calls the vision API → returns image description
## Environment Variables
-| Variable | Description | Example |
-| ----------------- | ------------------------------------------------------------------ | ------------------------------- |
-| `VISION_API_KEY` | Vision API key | `sk-your-api-key` |
-| `VISION_API_URL` | Vision API base URL | `https://your-api-endpoint/v1` |
-| `VISION_MODEL` | Vision model name
(not needed for MiniMax) | `your-vision-model` |
-| `VISION_API_TYPE` | Optional, force API type
`openai` / `minimax` | `minimax` |
+| Variable | Description | Example |
+| ----------------------- | ------------------------------------------------------------------ | ------------------------------- |
+| `VISION_MODE` | Delegation mode: `api` (default) or `subagent`. Auto-detects if unset — uses `api` when `VISION_API_KEY` is set, otherwise falls back to `subagent`. | `subagent` |
+| `VISION_API_KEY` | Vision API key (required for `api` mode) | `sk-your-api-key` |
+| `VISION_API_URL` | Vision API base URL (required for `api` mode) | `https://your-api-endpoint/v1` |
+| `VISION_MODEL` | Vision model name (required for OpenAI-compatible backends; not needed for MiniMax) | `your-vision-model` |
+| `VISION_API_TYPE` | Optional, force API type `openai` / `minimax` | `minimax` |
+| `VISION_SUBAGENT_NAME` | Subagent identifier for `subagent` mode (default: `image-reader`) | `image-reader` |
+| `VISION_MAX_TOKENS` | Vision API max response tokens (default: 4096) | `4096` |
+| `VISION_FETCH_TIMEOUT_MS` | Fetch timeout in ms (default: 60000) | `60000` |
+| `VISION_MAX_IMAGES` | LRU image cache cap (default: 200) | `200` |
+
+### Delegation Modes
+
+The plugin supports two modes for obtaining image descriptions when the active model lacks vision:
+
+#### `api` mode (default, original behaviour)
+
+The `vision` tool calls an external VLM API directly. Requires `VISION_API_KEY` and `VISION_API_URL`. Supports OpenAI-compatible backends and MiniMax VLM.
+
+```bash
+export VISION_MODE=api
+export VISION_API_KEY="sk-your-api-key"
+export VISION_API_URL="https://your-api-endpoint/v1"
+export VISION_MODEL="your-vision-model"
+```
+
+#### `subagent` mode (new)
+
+The plugin instructs the LLM to delegate image analysis to a vision-capable subagent via the Task tool. No external API key required — the subagent runs on whatever multimodal model is configured in opencode (e.g. `opencode-go/minimax-m3`).
+
+```bash
+export VISION_MODE=subagent
+# Optional: override the subagent name (default: image-reader)
+# export VISION_SUBAGENT_NAME=image-reader
+```
+
+**Setup for subagent mode:**
+
+1. Create a subagent definition at `~/.config/opencode/agent/image-reader.md`:
+
+```markdown
+---
+description: Analyzes images and screenshots using a multimodal model. Use when the main agent cannot view images.
+mode: subagent
+model: opencode-go/minimax-m3
+permission:
+ read: allow
+ glob: allow
+ list: allow
+ bash: deny
+ edit: deny
+---
+
+You are a vision analyst. Read the image at the given path using the `read` tool and describe what you see.
+```
+
+2. Restart opencode. The plugin will automatically:
+ - Save pasted images to `/tmp/opencode-vision/image{N}/`
+ - Inject a system prompt instructing the non-vision model to delegate
+ - Inject a path hint naming the subagent
+
+#### Auto-fallback
+
+When `VISION_MODE` is unset, the plugin uses:
+- `api` mode if `VISION_API_KEY` is present
+- `subagent` mode otherwise
+
+This means the plugin works out-of-the-box without any external credentials, as long as a vision-capable subagent is configured.
> `VISION_API_URL`: OpenAI-compatible backends auto-append `/chat/completions`; MiniMax auto-detects and uses `/v1/coding_plan/vlm`.
>
diff --git a/plugins/vision-helper.ts b/plugins/vision-helper.ts
index 402091d..3447603 100644
--- a/plugins/vision-helper.ts
+++ b/plugins/vision-helper.ts
@@ -42,11 +42,38 @@ function isPluginInjectedText(text: string): boolean {
if (!text) return false
return (
/^\[Image #\d+ auto-saved to /.test(text) ||
+ /^\[Image #\d+ \w+\.\w+ auto-saved to /.test(text) ||
/^\[Images auto-saved to:/.test(text) ||
+ /^\[Images \(\d+\) auto-saved to:/.test(text) ||
/^\[vision: image\d+\/[\w-]+\.[\w]+]$/.test(text)
)
}
+// ── Delegation mode ──
+//
+// VISION_MODE controls how non-vision models obtain image descriptions:
+// "api" (default, original) — the vision tool calls an external VLM
+// API (VISION_API_KEY + VISION_API_URL). This is the upstream
+// behaviour and is left untouched.
+// "subagent" — the plugin instructs the LLM to delegate image analysis to
+// the @image-reader subagent via the Task tool. No external API
+// key is required; the subagent runs on whatever vision-capable
+// model is configured in opencode (e.g. opencode-go/minimax-m3).
+//
+// Auto-fallback: when VISION_MODE is unset, the plugin uses "api" mode if
+// VISION_API_KEY is present, otherwise it falls back to "subagent" mode so
+// the plugin works out-of-the-box without any external credentials.
+//
+// VISION_SUBAGENT_NAME overrides the subagent identifier injected into the
+// system prompt and path hint (default: "image-reader").
+const VISION_MODE_RAW = (process.env["VISION_MODE"] || "").toLowerCase()
+const VISION_SUBAGENT_NAME = process.env["VISION_SUBAGENT_NAME"] || "image-reader"
+const hasApiKey = !!process.env["VISION_API_KEY"]
+const VISION_MODE: "api" | "subagent" =
+ VISION_MODE_RAW === "subagent" ? "subagent" :
+ VISION_MODE_RAW === "api" ? "api" :
+ hasApiKey ? "api" : "subagent"
+
/**
* Hook runs just before messages are sent to the model. For every user message
* with attached images:
@@ -100,6 +127,9 @@ export default (async () => {
"**Native-vision models should NEVER call this tool — use the built-in `read` tool instead, which returns the actual image attachment directly. This tool exists for text-only models that cannot parse image bytes returned by `read`.**",
].join("\n")
}
+ // In subagent mode, the vision tool is typically absent (no VISION_API_KEY).
+ // OpenCode silently ignores tool.definition calls for tools that don't
+ // exist, so this hook is a no-op in that case — no extra guard needed.
},
"experimental.chat.system.transform": async (input, output) => {
const model = input.model as unknown as {
@@ -112,6 +142,10 @@ export default (async () => {
output.system.push(
"You have native image input capabilities. You can directly view and analyze images attached to user messages. Do NOT call the `vision` tool to read images sent by the user — analyze them natively instead.",
)
+ } else if (VISION_MODE === "subagent") {
+ output.system.push(
+ `IMPORTANT: This model does NOT support image input. When a user attaches an image or screenshot, OpenCode will save it to a temp directory and inject a path hint like '[Image #N auto-saved to /tmp/opencode-vision/imageN/hash.png]'. You MUST delegate image analysis to the @${VISION_SUBAGENT_NAME} subagent via the Task tool (subagent_type="${VISION_SUBAGENT_NAME}", prompt="Read and describe the image at "). The @${VISION_SUBAGENT_NAME} subagent runs on a multimodal model (e.g. opencode-go/minimax-m3) that can read images. Never attempt to read images directly with the \`read\` tool — it will fail with 'model does not support image input'.`,
+ )
}
},
"experimental.chat.messages.transform": async (_input, output) => {
@@ -177,15 +211,18 @@ export default (async () => {
if (saved.length === 0) continue
- // Build path hint(s). Intentionally does NOT guide the LLM to the
- // vision tool — the tool.definition hook above steers native-vision
- // models toward the built-in read tool, and the vision tool's own
- // description recommends read for native-vision models. The hint
- // just records where the temp copy is, so any model that needs it
- // can find it.
+ // Build path hint(s). The hint records where the temp copy lives so
+ // any model that needs it can find it. In "api" mode the hint is
+ // neutral (the vision tool's own description handles routing); in
+ // "subagent" mode the hint explicitly names the @image-reader
+ // subagent so the LLM knows to delegate via the Task tool.
+ const hintSuffix = VISION_MODE === "subagent"
+ ? ` — use @${VISION_SUBAGENT_NAME} subagent via Task tool to analyze it if you cannot view images natively]`
+ : `]`
+
const hints = saved.length === 1
- ? `[Image #${saved[0].seq} auto-saved to ${path.join(TMP_DIR, `image${saved[0].seq}`, saved[0].name)}]`
- : `[Images auto-saved to:\n${saved.map((s) => ` ${path.join(TMP_DIR, `image${s.seq}`, s.name)}`).join("\n")}]`
+ ? `[Image #${saved[0].seq} ${saved[0].name} auto-saved to ${path.join(TMP_DIR, `image${saved[0].seq}`, saved[0].name)}${hintSuffix}`
+ : `[Images (${saved.length}) auto-saved to:\n${saved.map((s) => ` ${path.join(TMP_DIR, `image${s.seq}`, s.name)}`).join("\n")}${hintSuffix}`
msg.parts.push({
type: "text",