diff --git a/packages/types/src/__tests__/vscode-llm.spec.ts b/packages/types/src/__tests__/vscode-llm.spec.ts new file mode 100644 index 000000000..7a2eabddf --- /dev/null +++ b/packages/types/src/__tests__/vscode-llm.spec.ts @@ -0,0 +1,36 @@ +import { describe, it, expect } from "vitest" +import { vscodeLlmModels, vscodeLlmDefaultModelId } from "../providers/vscode-llm.js" + +describe("vscodeLlmModels", () => { + it("exposes the opus-4.8 row with its measured maxInputTokens and contextWindow", () => { + // The VS Code LM API exposes only maxInputTokens; that is the value the UI reads from this + // table (useSelectedModel.ts). For claude-opus-4.8 the two fields intentionally DIVERGE: + // maxInputTokens (197897) is the enforced input ceiling, while contextWindow (679560) records + // the larger advertised window. The UI reads maxInputTokens, so the divergence is a deliberate + // tripwire — assert the actual on-disk literals rather than forcing equality. + expect(vscodeLlmModels).toHaveProperty("claude-opus-4.8") + expect(vscodeLlmModels["claude-opus-4.8"].contextWindow).toBe(679560) + expect(vscodeLlmModels["claude-opus-4.8"].maxInputTokens).toBe(197897) + }) + it("preserves the real window for models captured with a smaller maxInputTokens", () => { + expect(vscodeLlmModels["gpt-4o-mini"].maxInputTokens).toBe(12078) + expect(vscodeLlmModels["gpt-4o-mini"].contextWindow).toBe(12078) + expect(vscodeLlmModels["gemini-2.5-pro"].contextWindow).toBe(108594) + expect(vscodeLlmModels["gemini-2.5-pro"].maxInputTokens).toBe(108594) + }) + it("keeps both window fields populated and positive for every row", () => { + for (const [family, model] of Object.entries(vscodeLlmModels)) { + expect(model.contextWindow, `${family}: contextWindow must be a positive integer`).toBeGreaterThan(0) + expect(model.maxInputTokens, `${family}: maxInputTokens must be a positive integer`).toBeGreaterThan(0) + } + }) + it("excludes fabricated/internal/alias families and the dropped legacy rows", 
() => { + expect(vscodeLlmModels).not.toHaveProperty("claude-opus-4.7-high") + expect(vscodeLlmModels).not.toHaveProperty("claude-3.5-sonnet") + expect(vscodeLlmModels).not.toHaveProperty("claude-4-sonnet") + }) + it("defaults to a model id that exists in the table", () => { + expect(vscodeLlmDefaultModelId).toBe("claude-sonnet-4.5") + expect(vscodeLlmModels).toHaveProperty(vscodeLlmDefaultModelId) + }) +}) diff --git a/packages/types/src/providers/vscode-llm.ts b/packages/types/src/providers/vscode-llm.ts index efe069191..46df75fac 100644 --- a/packages/types/src/providers/vscode-llm.ts +++ b/packages/types/src/providers/vscode-llm.ts @@ -2,189 +2,222 @@ import type { ModelInfo } from "../model.js" export type VscodeLlmModelId = keyof typeof vscodeLlmModels -export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-3.5-sonnet" +export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-sonnet-4.5" -// https://docs.cline.bot/provider-config/vscode-language-model-api +// Curated VS Code LM (GitHub Copilot) model catalog. +// +// The VS Code LM API only exposes `maxInputTokens` per model; it does NOT report a separate +// total context window. For each row, `contextWindow` records the model's advertised window +// while `maxInputTokens` is the enforced input ceiling the UI actually reads (via +// useSelectedModel.ts) and the condense gate measures against. For most rows the two values +// match. They intentionally DIVERGE only where the provider advertises a larger window than the +// usable input ceiling (e.g. claude-opus-4.8): keeping both fields lets the context bar and the +// auto-condense gate stay on a single source of truth (maxInputTokens) without losing the real +// advertised window. 
export const vscodeLlmModels = { - "gpt-3.5-turbo": { - contextWindow: 12114, - supportsImages: false, + "claude-opus-4.8": { + contextWindow: 679560, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-3.5-turbo", - version: "gpt-3.5-turbo-0613", - name: "GPT 3.5 Turbo", + family: "claude-opus-4.8", + version: "claude-opus-4.8", + name: "Claude Opus 4.8", supportsToolCalling: true, - maxInputTokens: 12114, + maxInputTokens: 197897, }, - "gpt-4o-mini": { - contextWindow: 12115, - supportsImages: false, + "claude-opus-4.7": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o-mini", - version: "gpt-4o-mini-2024-07-18", - name: "GPT-4o mini", + family: "claude-opus-4.7", + version: "claude-opus-4.7", + name: "Claude Opus 4.7", supportsToolCalling: true, - maxInputTokens: 12115, + maxInputTokens: 197897, }, - "gpt-4": { - contextWindow: 28501, - supportsImages: false, + "claude-opus-4.6": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4", - version: "gpt-4-0613", - name: "GPT 4", + family: "claude-opus-4.6", + version: "claude-opus-4.6", + name: "Claude Opus 4.6", supportsToolCalling: true, - maxInputTokens: 28501, + maxInputTokens: 197897, }, - "gpt-4-0125-preview": { - contextWindow: 63826, - supportsImages: false, + "claude-opus-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4-turbo", - version: "gpt-4-0125-preview", - name: "GPT 4 Turbo", + family: "claude-opus-4.5", + version: "claude-opus-4.5", + name: "Claude Opus 4.5", supportsToolCalling: true, - maxInputTokens: 63826, + maxInputTokens: 167790, }, - "gpt-4o": { - contextWindow: 63827, + "claude-sonnet-4.6": { + contextWindow: 197896, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: 
"gpt-4o", - version: "gpt-4o-2024-11-20", - name: "GPT-4o", + family: "claude-sonnet-4.6", + version: "claude-sonnet-4.6", + name: "Claude Sonnet 4.6", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 197896, }, - o1: { - contextWindow: 19827, - supportsImages: false, + "claude-sonnet-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o1-ga", - version: "o1-2024-12-17", - name: "o1 (Preview)", + family: "claude-sonnet-4.5", + version: "claude-sonnet-4.5", + name: "Claude Sonnet 4.5", supportsToolCalling: true, - maxInputTokens: 19827, + maxInputTokens: 167790, }, - "o3-mini": { - contextWindow: 63827, - supportsImages: false, + "claude-haiku-4.5": { + contextWindow: 135790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o3-mini", - version: "o3-mini-2025-01-31", - name: "o3-mini", + family: "claude-haiku-4.5", + version: "claude-haiku-4.5", + name: "Claude Haiku 4.5", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 135790, }, - "claude-3.5-sonnet": { - contextWindow: 81638, + "gpt-5.5": { + contextWindow: 268426, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-3.5-sonnet", - version: "claude-3.5-sonnet", - name: "Claude 3.5 Sonnet", + family: "gpt-5.5", + version: "gpt-5.5", + name: "GPT-5.5", supportsToolCalling: true, - maxInputTokens: 81638, + maxInputTokens: 268426, }, - "claude-4-sonnet": { - contextWindow: 128000, + "gpt-5.4": { + contextWindow: 268424, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-sonnet-4", - version: "claude-sonnet-4", - name: "Claude Sonnet 4", + family: "gpt-5.4", + version: "gpt-5.4", + name: "GPT-5.4", supportsToolCalling: true, - maxInputTokens: 111836, + maxInputTokens: 268424, }, - "gemini-2.0-flash-001": { - contextWindow: 127827, + "gpt-5.4-mini": { + contextWindow: 
271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.0-flash", - version: "gemini-2.0-flash-001", - name: "Gemini 2.0 Flash", - supportsToolCalling: false, - maxInputTokens: 127827, + family: "gpt-5.4-mini", + version: "gpt-5.4-mini", + name: "GPT-5.4 mini", + supportsToolCalling: true, + maxInputTokens: 271790, }, - "gemini-2.5-pro": { - contextWindow: 128000, + "gpt-5.3-codex": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.5-pro", - version: "gemini-2.5-pro-preview-03-25", - name: "Gemini 2.5 Pro (Preview)", + family: "gpt-5.3-codex", + version: "gpt-5.3-codex", + name: "GPT-5.3-Codex", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 271790, }, - "o4-mini": { - contextWindow: 128000, + "gpt-5-mini": { + contextWindow: 127790, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gpt-5-mini", + version: "gpt-5-mini", + name: "GPT-5 mini", + supportsToolCalling: true, + maxInputTokens: 127790, + }, + "gpt-4o-mini": { + contextWindow: 12078, supportsImages: false, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o4-mini", - version: "o4-mini-2025-04-16", - name: "o4-mini (Preview)", + family: "gpt-4o-mini", + version: "gpt-4o-mini-2024-07-18", + name: "GPT-4o mini", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 12078, }, - "gpt-4.1": { - contextWindow: 128000, + "gemini-3.1-pro-preview": { + contextWindow: 197897, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4.1", - version: "gpt-4.1-2025-04-14", - name: "GPT-4.1 (Preview)", + family: "gemini-3.1-pro-preview", + version: "gemini-3.1-pro-preview", + name: "Gemini 3.1 Pro (Preview)", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 197897, }, - "gpt-5-mini": { - contextWindow: 128000, + 
"gemini-3.5-flash": { + contextWindow: 197895, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5-mini", - version: "gpt-5-mini", - name: "GPT-5 mini (Preview)", + family: "gemini-3.5-flash", + version: "gemini-3.5-flash", + name: "Gemini 3.5 Flash", + supportsToolCalling: true, + maxInputTokens: 197895, + }, + "gemini-3-flash": { + contextWindow: 108594, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gemini-3-flash", + version: "gemini-3-flash-preview", + name: "Gemini 3 Flash (Preview)", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, - "gpt-5": { - contextWindow: 128000, + "gemini-2.5-pro": { + contextWindow: 108594, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5", - version: "gpt-5", - name: "GPT-5 (Preview)", + family: "gemini-2.5-pro", + version: "gemini-2.5-pro", + name: "Gemini 2.5 Pro", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, } as const satisfies Record< string, diff --git a/src/api/index.ts b/src/api/index.ts index 0c901f8e2..00201b0d2 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -107,6 +107,17 @@ export interface ApiHandler { getModel(): { id: string; info: ModelInfo } + /** + * Optional: the context window (in tokens) to use for context-management / + * auto-condense decisions, when it must differ from getModel().info.contextWindow. + * + * Only the VS Code LM (Copilot) provider overrides this, to measure usage against the + * model's static `maxInputTokens` instead of the inflated live window VS Code reports. + * Other providers leave it undefined and callers fall back to getModel().info.contextWindow, + * so their behavior is unchanged. 
+ */ + getCondenseContextWindow?(): number + /** * Counts tokens for content blocks * All providers extend BaseProvider which provides a default tiktoken implementation, diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index a79a5a4bc..eb026e816 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -63,6 +63,7 @@ import * as vscode from "vscode" import { VsCodeLmHandler } from "../vscode-lm" import type { ApiHandlerOptions } from "../../../shared/api" import type { Anthropic } from "@anthropic-ai/sdk" +import { openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" const mockLanguageModelChat = { id: "test-model", @@ -440,6 +441,88 @@ describe("VsCodeLmHandler", () => { const model = handler.getModel() expect(model.info).toBeDefined() }) + + it("should use the full advertised maxInputTokens without an upper cap", async () => { + // VS Code can report a very large advertised window; getModel surfaces it as-is + // (Math.max(0, maxInputTokens)) rather than clamping to a smaller default. 
+ const mockModel = { ...mockLanguageModelChat, maxInputTokens: 936000 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(936000) + }) + + it("should pass through a small maxInputTokens unchanged", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: 4096 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(4096) + }) + + it("should fall back to sane defaults when maxInputTokens is not a number", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: undefined as unknown as number } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(openAiModelInfoSaneDefaults.contextWindow) + }) + }) + + describe("getCondenseContextWindow", () => { + it("uses the static-table maxInputTokens for a known VS Code LM family", () => { + const opusHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family: "claude-opus-4.8" }, + }) + expect(opusHandler.getCondenseContextWindow()).toBe(vscodeLlmModels["claude-opus-4.8"].maxInputTokens) + opusHandler.dispose() + }) + + it("falls back to the live model context window for families not in the static table", () => { + // test-family is not a curated row, so the gate uses the live runtime window. 
+ handler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) + expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + }) + + it("falls back to the live window when no family is resolvable (no client, no selector family)", () => { + // With neither a client nor a selector family, `family` is undefined, so the static-table + // lookup is skipped entirely and the gate uses getModel().info.contextWindow (fallback info). + const noFamilyHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot" } }) + noFamilyHandler["client"] = null + expect(noFamilyHandler.getCondenseContextWindow()).toBe(noFamilyHandler.getModel().info.contextWindow) + expect(noFamilyHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) + noFamilyHandler.dispose() + }) + + it("falls back to the derived window when the static row exists but maxInputTokens is non-positive", () => { + // Guard sub-condition: a curated family is found but its maxInputTokens is <= 0 (corrupt/zeroed). + // With the selector family `claude-opus-4.8` and no live client, the zeroed static row is the one + // consulted, so the `maxInputTokens > 0` guard fails and the gate falls back to the derived window + // from getModel().info.contextWindow (sane defaults here, since there is no live client). + const family = "claude-opus-4.8" + const original = vscodeLlmModels[family].maxInputTokens + try { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = 0 + const guardHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family }, + }) + // Leave the client unset so `family` resolves from the selector (claude-opus-4.8), + // forcing the zeroed static row to be read instead of a live client's family. 
+ guardHandler["client"] = null + expect(guardHandler.getCondenseContextWindow()).toBe(guardHandler.getModel().info.contextWindow) + expect(guardHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) + guardHandler.dispose() + } finally { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = original + } + }) }) describe("countTokens", () => { diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 8fb564a9d..d730658b4 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -2,7 +2,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import * as vscode from "vscode" import OpenAI from "openai" -import { type ModelInfo, openAiModelInfoSaneDefaults } from "@roo-code/types" +import { type ModelInfo, openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" import type { ApiHandlerOptions } from "../../shared/api" import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils" @@ -562,6 +562,28 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } + /** + * Context window used for auto-condense / context-management decisions. + * + * VS Code's LM API reports `client.maxInputTokens` as Copilot's *advertised* window, + * which is far larger than the realistic usable window; relying on it keeps auto-condense + * from ever firing. For condense decisions we instead measure usage against the curated + * static table's `maxInputTokens` — the same value the context bar uses via + * `useSelectedModel` — so the gate and the gauge stay on one source of truth. + * + * Falls back to the live runtime window when the selected model isn't in the static table. + */ + getCondenseContextWindow(): number { + const family = this.client?.family ?? this.options.vsCodeLmModelSelector?.family + const staticModel = family ? 
vscodeLlmModels[family as keyof typeof vscodeLlmModels] : undefined + + if (staticModel && typeof staticModel.maxInputTokens === "number" && staticModel.maxInputTokens > 0) { + return staticModel.maxInputTokens + } + + return this.getModel().info.contextWindow + } + async completePrompt(prompt: string): Promise { try { const client = await this.getClient() diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index 9950ec536..ba0a77aac 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -810,9 +810,10 @@ describe("Context Management", () => { const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const modelInfo = createModelInfo(100000, 30000) - // Set tokens to be below both the allowedTokens threshold and the percentage threshold + // This call uses the default (full-window) denominator: 30000 / 100000 = 30% < 50% threshold. + // Even against available input (100000 - 30000 = 70000) usage is only ≈ 43%, so nothing condenses. 
const contextWindow = modelInfo.contextWindow - const totalTokens = 40000 // 40% of context window + const totalTokens = 30000 const messagesWithSmallContent = [ ...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }, @@ -825,7 +826,7 @@ describe("Context Management", () => { maxTokens: modelInfo.maxTokens, apiHandler: mockApiHandler, autoCondenseContext: true, - autoCondenseContextPercent: 50, // Set threshold to 50% - our tokens are at 40% + autoCondenseContextPercent: 50, // Set threshold to 50% - usage is ~43% of available input systemPrompt: "System prompt", taskId, profileThresholds: {}, @@ -1507,19 +1508,42 @@ describe("Context Management", () => { }) it("should return false when context percent is below threshold", () => { + // Available-input denominator (opt-in): available = 100000 - 30000 = 70000; + // 30000 / 70000 ≈ 43% < 50% threshold. const result = willManageContext({ - totalTokens: 40000, - contextWindow: 100000, // 40% of context window + totalTokens: 30000, + contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, - autoCondenseContextPercent: 50, // 50% threshold + autoCondenseContextPercent: 50, // 50% threshold; usage is ~43% of available input profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(false) }) + it("should treat a negative maxTokens (vscode-lm reports -1) as the default reserve, not -1", () => { + // vscode-lm reports maxTokens: -1 (unlimited). A naive `maxTokens || DEFAULT` keeps -1, + // which would make allowedTokens balloon past the window and skew the percentage. The + // guard must treat -1 like an unknown reserve (ANTHROPIC_DEFAULT_MAX_TOKENS for the + // allowed-tokens math, zero reserve for the available-input percentage). + // With autoCondenseContext disabled, only the allowedTokens path can trigger: + // allowedTokens = 100000 * 0.9 - 8192 = 81808; totalTokens 85000 > 81808 → true. 
+ const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + it("should return true when tokens exceed allowedTokens even if autoCondenseContext is false", () => { // allowedTokens = contextWindow * (1 - 0.1) - reservedTokens = 100000 * 0.9 - 30000 = 60000 const result = willManageContext({ @@ -1581,10 +1605,12 @@ describe("Context Management", () => { }) it("should include lastMessageTokens in the calculation", () => { - // Without lastMessageTokens: 49000 tokens = 49% - // With lastMessageTokens: 49000 + 2000 = 51000 tokens = 51% + // Available-input denominator (opt-in): available = 100000 - 30000 = 70000. + // Without lastMessageTokens: 34000 / 70000 ≈ 48.6% < 50% threshold. + // With lastMessageTokens: (34000 + 2000) / 70000 ≈ 51.4% ≥ 50% threshold. + // (Against the full window both cases are < 50%, so this case requires the opt-in flag.) 
const resultWithoutLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, @@ -1592,18 +1618,20 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(resultWithoutLastMessage).toBe(false) const resultWithLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, autoCondenseContextPercent: 50, // 50% threshold profileThresholds: {}, currentProfileId: "default", - lastMessageTokens: 2000, // Pushes total to 51% + lastMessageTokens: 2000, // Pushes usage over 50% of available input + useAvailableInputForContextPercent: true, }) expect(resultWithLastMessage).toBe(true) }) @@ -1701,4 +1729,313 @@ describe("Context Management", () => { expect(result.newContextTokensAfterTruncation).toBeGreaterThan(0) }) }) + + /** + * Regression tests for the opt-in available-input denominator (vscode-lm). With the flag on, + * the condense gate measures usage against available input space (contextWindow - reserved + * output), not the raw context window. This keeps the gate in lockstep with the UI context + * gauge and ensures it actually fires for vscode-lm, which reports maxTokens: -1. The default + * (full-window) behavior for every other provider is covered by the sibling describe below. 
+ */ + describe("contextPercent uses available input space (opt-in, regression)", () => { + const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ + contextWindow, + supportsPromptCache: true, + maxTokens, + }) + + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext measures the percentage against available input, not the full window", () => { + // contextWindow 200000, reserve 64000 → available input 136000. + // totalTokens 100000 → 100000 / 136000 ≈ 73.5%, which clears the 70% threshold. + // Against the full window it would be only 50% and the gate would (wrongly) stay closed. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("willManageContext stays below threshold when usage is under available input", () => { + // available input 136000; totalTokens 90000 → ≈ 66.2% < 70% threshold. + const result = willManageContext({ + totalTokens: 90000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(false) + }) + + it("willManageContext treats an unlimited (-1) reserve as zero reserve for the percentage", () => { + // vscode-lm reports maxTokens: -1. The percentage denominator should fall back to the + // full window (zero reserve): 150000 / 200000 = 75% ≥ 70% threshold. 
+ const result = willManageContext({ + totalTokens: 150000, + contextWindow: 200000, + maxTokens: -1, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("willManageContext falls back to 100% when the reserve is >= the window (availableInput <= 0)", () => { + // When maxTokens (reserve) >= contextWindow, availableInputTokens = window - reserve <= 0. + // The denominator guard must short-circuit contextPercent to 100 rather than divide by + // a non-positive number, so the gate fires regardless of the (tiny) totalTokens. + const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, // reserve > window → availableInput = -10000 + autoCondenseContext: true, + autoCondenseContextPercent: 80, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + // contextPercent === 100 >= 80 threshold → true. + expect(result).toBe(true) + }) + + it("willManageContext falls back to 100% when the reserve exactly equals the window (availableInput === 0)", () => { + // Boundary: reserve === window → availableInputTokens === 0, still the FALSE branch (> 0 is false). 
+ const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 50000, + autoCondenseContext: true, + autoCondenseContextPercent: 90, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("manageContext summarizes via the 100% fallback when the reserve >= the window (availableInput <= 0)", async () => { + // Mirror the willManageContext edge for the manageContext path: reserve >= window forces + // contextPercent to 100 via the denominator guard, so summarization triggers even though + // totalTokens is small relative to the raw window. + const mockSummary = "Reserve-exceeds-window summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + // contextWindow 50000, maxTokens 60000 → availableInput = -10000 → contextPercent = 100. 
+ const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 80, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + useAvailableInputForContextPercent: true, + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: 1, + }) + + summarizeSpy.mockRestore() + }) + + it("manageContext summarizes based on available input space, end-to-end", async () => { + const mockSummary = "Available-input summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + const modelInfo = createModelInfo(200000, 64000) + // available input 136000; totalTokens 100000 → ≈ 73.5% ≥ 70% threshold, but only 50% of + // the raw window. The end-to-end path must trigger summarization on the available-input math. 
+ const totalTokens = 100000 + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens, + contextWindow: modelInfo.contextWindow, + maxTokens: modelInfo.maxTokens, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + useAvailableInputForContextPercent: true, + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: totalTokens, + }) + + summarizeSpy.mockRestore() + }) + }) + + /** + * Scoping tests: the available-input denominator is opt-in. By default (flag omitted), the gate + * divides by the FULL context window, exactly as every non-vscode-lm provider did before the + * vscode-lm fix. The maxTokens: -1 reserve guard, however, remains global on the default path. + */ + describe("contextPercent denominator is opt-in (default = full window)", () => { + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext divides by the full window when the flag is omitted (default)", () => { + // Same inputs as the regression block: contextWindow 200000, reserve 64000, totalTokens 100000. + // Default (full window): 100000 / 200000 = 50% < 70% threshold → false. Under the opt-in + // available-input math it would be ≈ 73.5% and fire — this proves the scoping. 
+ const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(false) + }) + + it("willManageContext fires on the same inputs when the opt-in flag is true", () => { + // Identical inputs, flag on: available input 136000 → 100000 / 136000 ≈ 73.5% ≥ 70% → true. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("keeps the maxTokens:-1 reserve guard on the default (full-window) path", () => { + // The reserve guard is global, independent of the percent denominator. With auto-condense + // off, only the allowedTokens path can fire: allowedTokens = 100000 * 0.9 - 8192 = 81808; + // totalTokens 85000 > 81808 → true. (A naive `maxTokens || DEFAULT` keeping -1 would break this.) + const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("manageContext does NOT summarize on the default path where the opt-in math would have", async () => { + // contextWindow 200000, reserve 64000, totalTokens 100000. Default full-window percent is + // 50% < 70% threshold, and allowedTokens = 200000 * 0.9 - 64000 = 116000 > 100000, so neither + // condense nor truncation runs. With the opt-in flag this same case summarizes (asserted above + // in the regression block), proving the default path reverts to pre-fix behavior. 
+ const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") + + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).not.toHaveBeenCalled() + expect(result).toEqual({ + messages: messagesWithSmallContent, + summary: "", + cost: 0, + prevContextTokens: 100000, + }) + + summarizeSpy.mockRestore() + }) + }) }) diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index 243d7bd79..b4d89487f 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -147,6 +147,14 @@ export type WillManageContextOptions = { profileThresholds: Record currentProfileId: string lastMessageTokens: number + /** + * Opt-in: measure the condense percentage against the available input space + * (contextWindow - reserved output) instead of the full context window. Only providers + * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, + * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it + * undefined and keep dividing by the full context window (original behavior). 
+ */ + useAvailableInputForContextPercent?: boolean } /** @@ -167,16 +175,19 @@ export function willManageContext({ profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }: WillManageContextOptions): boolean { if (!autoCondenseContext) { // When auto-condense is disabled, only truncation can occur - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens return prevContextTokens > allowedTokens } - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens @@ -192,7 +203,20 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - const contextPercent = (100 * prevContextTokens) / contextWindow + // By default, measure usage against the full context window (original behavior shared by all + // providers). Opt-in (vscode-lm via getCondenseContextWindow) measures against the available + // input space (context window minus the reserved output budget) to match the UI context gauge, + // because that provider's advertised window is inflated relative to its usable input ceiling. + // Reserved output tokens can never hold conversation context. When the reserve is unknown or + // non-positive (e.g., vscode-lm reports -1), only this percentage falls back to the full context window; allowedTokens above still reserves ANTHROPIC_DEFAULT_MAX_TOKENS.
+ let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } return contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens } @@ -229,6 +253,14 @@ export type ContextManagementOptions = { cwd?: string /** Optional controller for file access validation */ rooIgnoreController?: RooIgnoreController + /** + * Opt-in: measure the condense percentage against the available input space + * (contextWindow - reserved output) instead of the full context window. Only providers + * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, + * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it + * undefined and keep dividing by the full context window (original behavior). + */ + useAvailableInputForContextPercent?: boolean } export type ContextManagementResult = SummarizeResponse & { @@ -262,12 +294,14 @@ export async function manageContext({ filesReadByRoo, cwd, rooIgnoreController, + useAvailableInputForContextPercent, }: ContextManagementOptions): Promise { let error: string | undefined let errorDetails: string | undefined let cost = 0 // Calculate the maximum tokens reserved for response - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? 
+ maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS // Estimate tokens for the last message (which is always a user message) const lastMessage = messages[messages.length - 1] @@ -304,7 +338,20 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - const contextPercent = (100 * prevContextTokens) / contextWindow + // By default, measure usage against the full context window (original behavior shared by all + // providers). Opt-in (vscode-lm via getCondenseContextWindow) measures against the available + // input space (context window minus the reserved output budget) to match the UI context gauge, + // because that provider's advertised window is inflated relative to its usable input ceiling. + // Reserved output tokens can never hold conversation context. When the reserve is unknown or + // non-positive (e.g., vscode-lm reports -1), only this percentage falls back to the full context window; reservedTokens above still defaults to ANTHROPIC_DEFAULT_MAX_TOKENS. + let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ?
+ (100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } if (contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens) { // Attempt to intelligently condense the context const result = await summarizeConversation({ diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 50d4674fd..81a243545 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -2688,9 +2688,13 @@ export class Task extends EventEmitter implements TaskLike { if (signal.aborted) { reject(new Error("Request cancelled by user")) } else { - signal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + signal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) return await Promise.race([nextPromise, abortPromise]) @@ -3734,7 +3738,14 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so context management runs in line with the context bar. When the optional + // call yields undefined (no other provider implements the method), this falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow + + // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the + // available-input condense denominator to that provider; all others use the full window.
+ const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3803,6 +3814,7 @@ export class Task extends EventEmitter implements TaskLike { currentProfileId, metadata, environmentDetails, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { @@ -3930,7 +3942,14 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so context management runs in line with the context bar. When the optional + // call yields undefined (no other provider implements the method), this falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow + + // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the + // available-input condense denominator to that provider; all others use the full window.
+ const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3955,6 +3974,7 @@ export class Task extends EventEmitter implements TaskLike { profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }) // Send condenseTaskContextStarted BEFORE manageContext to show in-progress indicator @@ -4037,6 +4057,7 @@ export class Task extends EventEmitter implements TaskLike { filesReadByRoo: contextMgmtFilesReadByRoo, cwd: this.cwd, rooIgnoreController: this.rooIgnoreController, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { await this.overwriteApiConversationHistory(truncateResult.messages) @@ -4191,10 +4212,14 @@ export class Task extends EventEmitter implements TaskLike { const iterator = stream[Symbol.asyncIterator]() // Set up abort handling - when the signal is aborted, clean up the controller reference - abortSignal.addEventListener("abort", () => { - console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) - this.currentRequestAbortController = undefined - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) + this.currentRequestAbortController = undefined + }, + { once: true }, + ) try { // Awaiting first chunk to see if it will throw an error. 
@@ -4206,9 +4231,13 @@ export class Task extends EventEmitter implements TaskLike { if (abortSignal.aborted) { reject(new Error("Request cancelled by user")) } else { - abortSignal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index 4ddf5ef35..927d3d057 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -76,7 +76,8 @@ const TaskHeader = ({ : 0, [model, modelId, apiConfiguration], ) - const reservedForOutput = maxTokens || 0 + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 const condenseButton = ( { // Should show 0% when available input space is 0 expect(screen.getByText("0%")).toBeInTheDocument() }) + + it("should treat a negative maxTokens (vscode-lm reports -1) as zero reserve", () => { + // vscode-lm reports maxTokens: -1 (unlimited). A naive `maxTokens || 0` keeps -1, + // which would inflate available input space and skew the percentage. The guard must + // treat -1 as a zero reserve so available space == contextWindow. 
+ // contextTokens = 250, contextWindow = 1000, reservedForOutput = 0 + // Percentage = 250 / 1000 * 100 = 25% + mockModelInfo = { contextWindow: 1000, maxTokens: -1 } + mockMaxOutputTokens = -1 + + renderTaskHeader({ contextTokens: 250 }) + + expect(screen.getByText("25%")).toBeInTheDocument() + }) }) }) diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 0dc42129c..3ffe85e14 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -14,6 +14,8 @@ import { minimaxDefaultModelId, minimaxModels, openRouterDefaultModelId, + vscodeLlmModels, + vscodeLlmDefaultModelId, } from "@roo-code/types" import { useSelectedModel } from "../useSelectedModel" @@ -772,4 +774,77 @@ describe("useSelectedModel", () => { expect(result.current.info).toEqual(minimaxModels["MiniMax-M2.7"]) }) }) + + describe("vscode-lm provider", () => { + beforeEach(() => { + mockUseRouterModels.mockReturnValue({ + data: { + openrouter: {}, + requesty: {}, + litellm: {}, + }, + isLoading: false, + isError: false, + } as any) + + mockUseOpenRouterModelProviders.mockReturnValue({ + data: {}, + isLoading: false, + isError: false, + } as any) + }) + + it("resolves a listed family's contextWindow to its maxInputTokens", () => { + const family = vscodeLlmDefaultModelId + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + // The bar and the condense gate share one source of truth: contextWindow === maxInputTokens. 
+ expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("pins a divergent family's contextWindow to maxInputTokens, not its advertised window", () => { + // claude-opus-4.8 is the row where contextWindow (679560) and maxInputTokens (197897) DIFFER. + // The hook must surface maxInputTokens so the bar matches the condense gate; a field swap to + // the advertised contextWindow would be caught here (unlike the default model where they match). + const family = "claude-opus-4.8" + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) // 197897 + expect(result.current.info?.contextWindow).not.toBe(vscodeLlmModels[family].contextWindow) // NOT 679560 + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("falls back to the default model's window for an unlisted family (NOT 128000)", () => { + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family: "totally-unknown-family" }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + // On a family miss we must NOT fall back to openAiModelInfoSaneDefaults' 128000 window, + // which would diverge from the gate. Instead, use the default model's maxInputTokens. 
+ expect(result.current.info?.contextWindow).not.toBe(128000) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[vscodeLlmDefaultModelId].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + }) }) diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index d3ebb6c0d..a5940ba7d 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -310,8 +310,22 @@ function getSelectedModel({ ? `${apiConfiguration.vsCodeLmModelSelector.vendor}/${apiConfiguration.vsCodeLmModelSelector.family}` : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId - const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] - return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images. + // On a family miss, fall back to the default model entry instead of openAiModelInfoSaneDefaults, + // whose 128K contextWindow would diverge from the gate and make the bar read >100% while + // auto-condense never fires (the gate uses the live window). + const listedModel = + vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] + // Set contextWindow = maxInputTokens so the UI bar matches what the condense gate uses for + // vscode-lm. The gate's primary window comes from getCondenseContextWindow() (which returns the + // static-table maxInputTokens); getModel().info.contextWindow is only the fallback. Sharing + // maxInputTokens keeps the bar and the gate on a single source of truth. + const info: ModelInfo = { + ...openAiModelInfoSaneDefaults, + ...listedModel, + contextWindow: listedModel.maxInputTokens, + supportsImages: false, // VSCode LM API currently doesn't support images. 
+ } + return { id, info } } case "sambanova": { const id = apiConfiguration.apiModelId ?? defaultModelId