diff --git a/AGENTS.md b/AGENTS.md index 05ae519..7c267fd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,6 +22,17 @@ runaway costs. - Prefer features that reduce tokens structurally (output caps, compaction) over features that merely expose knobs for users to tune manually. +## Model catalog (`src/models/catalog.ts`) +- A curated, offline list of coding models with pricing, context window, and a + relative coding score. It drives USD cost estimates and priority-based model + selection (`performance` / `cost` / `balanced`). +- Keep it current: when adding/repricing a model, update its entry **and** + `CATALOG_AS_OF`. Anthropic pricing comes from the bundled claude-api reference; + verify Gemini pricing against Google's published rates. Don't guess prices. +- `priority` defaults to `performance`, which preserves the historical default + models (Opus for Anthropic, Gemini 2.5 Pro for Gemini). Don't change the + default without updating the config tests that assert those ids. + ## Boundaries - No business logic. This is a general-purpose tool. - Don't add a second state paradigm or heavy dependencies without a clear reason. diff --git a/README.md b/README.md index 9d1363f..3a126a9 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ In the REPL: type a request, watch it work. Mutating actions (writes, edits, shell commands) prompt for approval unless pre-approved in config. - `/help` — list commands +- `/models` — show known models, pricing, and the active one (see below) - `/improve` — reflect on the session and propose an improvement PR (see below) - `/<name> [args]` — run a custom command (see below) - `/exit` — quit @@ -84,6 +85,7 @@ CLI flags. { "provider": "anthropic", "model": "claude-opus-4-8", + "priority": "performance", "maxTokens": 16000, "thinking": true, "effort": "high", @@ -128,6 +130,38 @@ the savings where it can: (see `TODO.md`), which will keep input-token counts from compounding across many turns without any user action. 
+## Model awareness & cost control + +tiny-code ships a small, curated catalog of coding models +(`src/models/catalog.ts`) with each model's pricing, context window, and a +relative coding-aptitude score. It uses this to turn raw token counts into real +money and to pick a model that fits your cost/performance preference. + +- **Dollar cost, not just tokens.** Per-turn usage and the session total show an + estimated USD cost next to the token counts, priced from the active model's + rate — so the bill is visible as you work, not a surprise later. +- **`/models`** lists the catalog (cheapest first) with pricing and scores, + marks the active model, and shows the session's running cost. +- **Priority-driven selection.** When you don't pin a `model`, tiny-code picks + one for you based on `priority`: + + | `priority` | Picks | + | --------------- | ----------------------------------------------------------- | + | `performance` | The most capable model (the default — current behavior). | + | `cost` | The cheapest still-capable model. | + | `balanced` | The best capability-per-dollar among capable models. | + + ```json + { "priority": "balanced" } + ``` + + Or per-session with `TINY_CODE_PRIORITY=cost`. Pinning `model` (config, env, + or `--model`) always overrides the recommendation. + +The catalog is curated and offline (tiny-code has no live model-discovery yet — +see `TODO.md`), so its prices carry an "as of" date; keep it current as vendors +ship new models and change pricing. + ## Self-improvement tiny-code can learn from how it's used. When a session ends (or when you run diff --git a/TODO.md b/TODO.md index 4de30a4..d149cce 100644 --- a/TODO.md +++ b/TODO.md @@ -51,6 +51,15 @@ Save/restore `AgentLoop.getMessages()` to disk; `--resume` to continue a session Pair with the compaction feature above so resumed sessions don't carry a bloated history. 
+## Live model catalog refresh +The model catalog (`src/models/catalog.ts`) is curated and offline, so its +pricing and model list drift until a human updates them. **Approach:** an opt-in +refresh that pulls current models/pricing from the provider APIs (Anthropic's +`GET /v1/models` for capabilities; a pricing source for rates) and Gemini's +equivalent, caching to disk with the `CATALOG_AS_OF` date. Gate behind a flag so +the default stays offline and deterministic. Pairs with the existing +priority-based selection — fresher data, same `recommendModel` logic. + ## ripgrep-backed grep The `grep` tool currently walks the tree in JS. **Approach:** detect `rg` on PATH and shell out for speed + .gitignore awareness, falling back to the JS diff --git a/src/cli.ts b/src/cli.ts index b664658..2ed0237 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -19,8 +19,10 @@ Options: -h, --help Show this help Environment: - ANTHROPIC_API_KEY Required for the Anthropic provider - GEMINI_API_KEY Required for the Gemini provider + ANTHROPIC_API_KEY Required for the Anthropic provider + GEMINI_API_KEY Required for the Gemini provider + TINY_CODE_PRIORITY performance | cost | balanced — auto-picks a model when + none is pinned (default: performance) `; function main(): void { diff --git a/src/config/load.ts b/src/config/load.ts index d2ce512..f1f1ed0 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -2,9 +2,12 @@ import { readFileSync, existsSync } from 'node:fs'; import { homedir } from 'node:os'; import { join } from 'node:path'; import { z } from 'zod'; +import type { Priority } from '../models/catalog.js'; +import { recommendModel } from '../models/catalog.js'; export type Provider = 'anthropic' | 'gemini'; export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max'; +export type { Priority } from '../models/catalog.js'; /** Auto-approval rules that bypass the interactive permission prompt. 
*/ export interface AllowRules { @@ -19,6 +22,8 @@ export interface AllowRules { export interface ResolvedConfig { provider: Provider; model: string; + /** Cost/performance bias used to auto-pick a model when none is pinned. */ + priority: Priority; anthropicApiKey: string | undefined; geminiApiKey: string | undefined; maxTokens: number; @@ -55,6 +60,7 @@ const FileConfigSchema = z .object({ provider: z.enum(['anthropic', 'gemini']).optional(), model: z.string().optional(), + priority: z.enum(['performance', 'cost', 'balanced']).optional(), maxTokens: z.number().int().positive().optional(), thinking: z.boolean().optional(), effort: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(), @@ -107,8 +113,17 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c file.provider ?? (anthropicApiKey ? 'anthropic' : geminiApiKey ? 'gemini' : 'anthropic'); + // Accept only a known priority from the env; a typo or empty value falls through to file/default. + const envPriority = (['performance', 'cost', 'balanced'] as const).find((p) => p === env.TINY_CODE_PRIORITY); + const priority: Priority = envPriority ?? file.priority ?? 'performance'; + // When the user pins a model, honor it. Otherwise let the catalog pick the + // best fit for the cost/performance priority, falling back to a static + // default if the catalog has no entry for the provider. + const pinnedModel = overrides.model ?? env.TINY_CODE_MODEL ?? file.model; const model = - overrides.model ?? env.TINY_CODE_MODEL ?? file.model ?? DEFAULT_MODELS[provider]; + pinnedModel ?? + recommendModel({ provider, priority })?.id ?? + DEFAULT_MODELS[provider]; const maxTokens = env.TINY_CODE_MAX_TOKENS ? 
Number(env.TINY_CODE_MAX_TOKENS) @@ -124,6 +139,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c return { provider, model, + priority, anthropicApiKey, geminiApiKey, maxTokens, diff --git a/src/index.ts b/src/index.ts index 1215b85..df727ba 100644 --- a/src/index.ts +++ b/src/index.ts @@ -20,7 +20,18 @@ export { PermissionGate } from './permissions/gate.js'; export type { PermissionPrompt, PermissionRequest, PermissionChoice } from './permissions/gate.js'; export { loadConfig } from './config/load.js'; -export type { ResolvedConfig, CliOverrides, Provider, Effort, AllowRules } from './config/load.js'; +export type { ResolvedConfig, CliOverrides, Provider, Effort, Priority, AllowRules } from './config/load.js'; + +export { + MODEL_CATALOG, + CATALOG_AS_OF, + getModelInfo, + estimateCostUsd, + formatUsd, + blendedCostPerMTok, + recommendModel, +} from './models/catalog.js'; +export type { ModelInfo, RecommendOptions } from './models/catalog.js'; export { loadProjectContext } from './config/context.js'; export { loadCommands, renderCommand } from './commands/loader.js'; diff --git a/src/models/catalog.ts b/src/models/catalog.ts new file mode 100644 index 0000000..a5a9c98 --- /dev/null +++ b/src/models/catalog.ts @@ -0,0 +1,132 @@ +import type { Provider } from '../config/load.js'; +import type { Usage } from '../providers/types.js'; + +/** + * How to weigh cost vs. capability when auto-selecting a model. + * - `performance`: most capable model (maximize quality, ignore price) + * - `cost`: cheapest capable model (maximize savings) + * - `balanced`: best capability-per-dollar among genuinely capable models + */ +export type Priority = 'performance' | 'cost' | 'balanced'; + +/** + * Curated facts about a coding model. Pricing is USD per 1,000,000 tokens for + * the standard (non-cached, ≤200K-context) tier — the common case for an + * interactive coding session. 
`codingScore` is a curated 0–100 estimate of + * relative aptitude on coding/agentic tasks, used only to rank models against + * each other; it is not a vendor benchmark. + */ +export interface ModelInfo { + id: string; + provider: Provider; + label: string; + inputPricePerMTok: number; + outputPricePerMTok: number; + contextWindow: number; + codingScore: number; +} + +/** + * The date this catalog's pricing and model list were last verified. Models and + * prices move; keep this current when updating entries. Anthropic figures come + * from the bundled claude-api reference; Gemini figures from Google's published + * API pricing. + */ +export const CATALOG_AS_OF = '2026-06-08'; + +/** + * The known coding models, newest/most-capable first within each provider. + * Keep this list curated — tiny-code runs offline-first, so it can't discover + * models at runtime. Update it (and {@link CATALOG_AS_OF}) as vendors ship. + */ +export const MODEL_CATALOG: ModelInfo[] = [ + // Anthropic — pricing per the claude-api model table. + { id: 'claude-opus-4-8', provider: 'anthropic', label: 'Claude Opus 4.8', inputPricePerMTok: 5, outputPricePerMTok: 25, contextWindow: 1_000_000, codingScore: 99 }, + { id: 'claude-opus-4-7', provider: 'anthropic', label: 'Claude Opus 4.7', inputPricePerMTok: 5, outputPricePerMTok: 25, contextWindow: 1_000_000, codingScore: 96 }, + { id: 'claude-opus-4-6', provider: 'anthropic', label: 'Claude Opus 4.6', inputPricePerMTok: 5, outputPricePerMTok: 25, contextWindow: 1_000_000, codingScore: 93 }, + { id: 'claude-sonnet-4-6', provider: 'anthropic', label: 'Claude Sonnet 4.6', inputPricePerMTok: 3, outputPricePerMTok: 15, contextWindow: 1_000_000, codingScore: 88 }, + { id: 'claude-haiku-4-5', provider: 'anthropic', label: 'Claude Haiku 4.5', inputPricePerMTok: 1, outputPricePerMTok: 5, contextWindow: 200_000, codingScore: 75 }, + + // Gemini — standard-tier pricing (prompts ≤200K tokens) from Google AI pricing. 
+ { id: 'gemini-2.5-pro', provider: 'gemini', label: 'Gemini 2.5 Pro', inputPricePerMTok: 1.25, outputPricePerMTok: 10, contextWindow: 1_048_576, codingScore: 90 }, + { id: 'gemini-2.5-flash', provider: 'gemini', label: 'Gemini 2.5 Flash', inputPricePerMTok: 0.3, outputPricePerMTok: 2.5, contextWindow: 1_048_576, codingScore: 72 }, + { id: 'gemini-2.5-flash-lite', provider: 'gemini', label: 'Gemini 2.5 Flash-Lite', inputPricePerMTok: 0.1, outputPricePerMTok: 0.4, contextWindow: 1_048_576, codingScore: 55 }, +]; + +/** Look up catalog facts for a model id, or `undefined` if it's not tracked. */ +export function getModelInfo(id: string): ModelInfo | undefined { + return MODEL_CATALOG.find((m) => m.id === id); +} + +/** Estimate the USD cost of a token usage given a model's pricing. */ +export function estimateCostUsd(usage: Usage, info: ModelInfo): number { + return ( + (usage.inputTokens / 1_000_000) * info.inputPricePerMTok + + (usage.outputTokens / 1_000_000) * info.outputPricePerMTok + ); +} + +/** Format a USD amount with precision that stays readable for tiny costs. */ +export function formatUsd(amount: number): string { + return `$${amount.toFixed(amount < 1 ? 4 : 2)}`; +} + +/** + * Coding sessions are input-heavy (history is resent every turn), so blend + * pricing 80% input / 20% output to compare models on a single cost number. + */ +export function blendedCostPerMTok(info: ModelInfo): number { + return info.inputPricePerMTok * 0.8 + info.outputPricePerMTok * 0.2; +} + +/** + * Minimum coding aptitude to consider, per priority. Keeps `balanced`/`cost` + * from collapsing onto the cheapest-but-weakest model — score-per-dollar always + * favors the cheapest, so a capability floor is what makes the tradeoff useful. + */ +const DEFAULT_MIN_SCORE: Record<Priority, number> = { + performance: 0, + balanced: 80, + cost: 60, +}; + +export interface RecommendOptions { + provider: Provider; + priority: Priority; + /** Reject models below this coding score. Defaults per-priority. 
*/ + minCodingScore?: number; + /** Reject models whose context window is smaller than this. */ + minContextWindow?: number; +} + +/** + * Pick the model that best fits a cost/performance priority. Returns the single + * best candidate, or `undefined` if the constraints exclude every model for the + * provider (callers should fall back to a static default). + */ +export function recommendModel(opts: RecommendOptions): ModelInfo | undefined { + const minScore = opts.minCodingScore ?? DEFAULT_MIN_SCORE[opts.priority]; + const candidates = MODEL_CATALOG.filter( + (m) => + m.provider === opts.provider && + m.codingScore >= minScore && + (opts.minContextWindow === undefined || m.contextWindow >= opts.minContextWindow), + ); + if (candidates.length === 0) return undefined; + + const score = (m: ModelInfo): number => { + switch (opts.priority) { + case 'performance': + // Highest aptitude; break ties toward the cheaper option. + return m.codingScore - blendedCostPerMTok(m) / 1000; + case 'cost': + // Cheapest; break ties toward the more capable option. + return -blendedCostPerMTok(m) + m.codingScore / 1000; + case 'balanced': + // Best capability per dollar. + return m.codingScore / blendedCostPerMTok(m); + } + }; + + return candidates.reduce((best, m) => (score(m) > score(best) ? 
m : best)); +} diff --git a/src/repl.ts b/src/repl.ts index f660036..2130e8d 100644 --- a/src/repl.ts +++ b/src/repl.ts @@ -13,10 +13,20 @@ import { buildSystemPrompt } from './agent/systemPrompt.js'; import { loadCommands, renderCommand } from './commands/loader.js'; import type { Command } from './commands/types.js'; import { runImprovement } from './improve/run.js'; +import { + MODEL_CATALOG, + CATALOG_AS_OF, + getModelInfo, + estimateCostUsd, + formatUsd, + blendedCostPerMTok, +} from './models/catalog.js'; +import type { Usage } from './providers/types.js'; function printHelp(commands: Map): void { console.log(pc.bold('\nBuilt-in:')); console.log(' /help Show this help'); + console.log(' /models Show known models, pricing, and the active one'); console.log(' /improve Reflect on this session and propose an improvement PR'); console.log(' /exit, /quit Leave the session'); if (commands.size > 0) { @@ -28,6 +38,35 @@ function printHelp(commands: Map): void { } } +/** Show the model catalog with pricing, ranked cheapest-first, marking the + * active model and the live session cost so cost/performance is visible. */ +function printModels(activeModel: string, priority: string, usage: Usage): void { + console.log( + pc.bold(`\nModels`) + + pc.dim(` · priority: ${priority} · pricing per 1M tokens · as of ${CATALOG_AS_OF}`), + ); + const ranked = [...MODEL_CATALOG].sort((a, b) => blendedCostPerMTok(a) - blendedCostPerMTok(b)); + for (const m of ranked) { + const active = m.id === activeModel; + const marker = active ? pc.green('●') : ' '; + const id = active ? 
pc.bold(m.id.padEnd(22)) : m.id.padEnd(22); + const detail = pc.dim( + `in $${m.inputPricePerMTok}/out $${m.outputPricePerMTok} score ${m.codingScore}`, + ); + console.log(`${marker} ${id} ${detail}`); + } + const info = getModelInfo(activeModel); + if (info && (usage.inputTokens > 0 || usage.outputTokens > 0)) { + console.log( + pc.dim( + `\nThis session: ↑ ${usage.inputTokens.toLocaleString('en-US')} ↓ ${usage.outputTokens.toLocaleString('en-US')} tokens ≈ ${formatUsd(estimateCostUsd(usage, info))}`, + ), + ); + } else if (!info) { + console.log(pc.dim(`\n(${activeModel} is not in the catalog — no cost estimate available.)`)); + } +} + export async function startRepl(overrides: CliOverrides): Promise { const cwd = process.cwd(); const config = loadConfig(overrides, cwd); @@ -51,7 +90,8 @@ export async function startRepl(overrides: CliOverrides): Promise { }); const gate = new PermissionGate(config.allow, prompt); - const ui = createTerminalUI(); + const modelInfo = getModelInfo(config.model); + const ui = createTerminalUI(modelInfo); const agent = new AgentLoop({ provider, registry, @@ -100,8 +140,12 @@ export async function startRepl(overrides: CliOverrides): Promise { } }; + const priceTag = modelInfo + ? 
` · $${modelInfo.inputPricePerMTok}/$${modelInfo.outputPricePerMTok} per 1M in/out` + : ''; console.log( - pc.bold('tiny-code') + pc.dim(` · ${provider.name}:${provider.model} · ${cwd}`), + pc.bold('tiny-code') + + pc.dim(` · ${provider.name}:${provider.model}${priceTag} · ${cwd}`), ); if (projectContext.trim().length > 0) { console.log(pc.dim('Loaded project context.')); @@ -124,6 +168,11 @@ export async function startRepl(overrides: CliOverrides): Promise { ask(); return; } + if (input === '/models') { + printModels(config.model, config.priority, agent.getUsage()); + ask(); + return; + } if (input === '/improve') { if (config.improve.enabled) { await improve(); @@ -164,8 +213,11 @@ export async function startRepl(overrides: CliOverrides): Promise { const usage = agent.getUsage(); if (usage.inputTokens > 0 || usage.outputTokens > 0) { const fmtN = (n: number) => n.toLocaleString('en-US'); + const cost = modelInfo ? ` ≈ ${formatUsd(estimateCostUsd(usage, modelInfo))}` : ''; console.log( - pc.dim(`\nSession: ↑ ${fmtN(usage.inputTokens)} ↓ ${fmtN(usage.outputTokens)} tokens total`), + pc.dim( + `\nSession: ↑ ${fmtN(usage.inputTokens)} ↓ ${fmtN(usage.outputTokens)} tokens total${cost}`, + ), ); } console.log(pc.dim('Bye.')); diff --git a/src/ui/render.ts b/src/ui/render.ts index 1364ddd..0f613bf 100644 --- a/src/ui/render.ts +++ b/src/ui/render.ts @@ -1,6 +1,8 @@ import pc from 'picocolors'; import type { AgentUI } from '../agent/loop.js'; import type { ToolResult } from '../tools/types.js'; +import type { ModelInfo } from '../models/catalog.js'; +import { estimateCostUsd, formatUsd } from '../models/catalog.js'; function preview(name: string, input: unknown): string { const obj = (input ?? {}) as Record; @@ -19,8 +21,11 @@ function truncate(s: string, n: number): string { return oneLine.length > n ? `${oneLine.slice(0, n)}…` : oneLine; } -/** Minimal streaming UI: assistant text inline, compact colored tool summaries. 
*/ -export function createTerminalUI(): AgentUI { +/** + * Minimal streaming UI: assistant text inline, compact colored tool summaries. + * Pass the active model's catalog info to also show a per-turn cost estimate. + */ +export function createTerminalUI(modelInfo?: ModelInfo): AgentUI { let atLineStart = true; const write = (s: string): void => { @@ -51,7 +56,10 @@ export function createTerminalUI(): AgentUI { write(pc.yellow(` ⊘ ${name} denied\n`)); }, onUsage(usage) { - write(pc.dim(` ↑ ${fmtN(usage.inputTokens)} ↓ ${fmtN(usage.outputTokens)} tokens\n`)); + const cost = modelInfo ? ` ${formatUsd(estimateCostUsd(usage, modelInfo))}` : ''; + write( + pc.dim(` ↑ ${fmtN(usage.inputTokens)} ↓ ${fmtN(usage.outputTokens)} tokens${cost}\n`), + ); }, onAssistantEnd() { ensureNewline(); diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index 2b73592..3d4ee5e 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -9,6 +9,7 @@ const ENV_KEYS = [ 'GEMINI_API_KEY', 'TINY_CODE_PROVIDER', 'TINY_CODE_MODEL', + 'TINY_CODE_PRIORITY', 'TINY_CODE_MAX_TOKENS', 'TINY_CODE_EFFORT', 'TINY_CODE_IMPROVE', @@ -105,6 +106,37 @@ describe('loadConfig', () => { expect(cfg.improve.onSessionEnd).toBe(false); }); + it('defaults to performance priority and the most capable model', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + const cfg = loadConfig({}, cwd); + expect(cfg.priority).toBe('performance'); + expect(cfg.model).toBe('claude-opus-4-8'); + }); + + it('auto-selects a cheaper model when priority is cost', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + process.env.TINY_CODE_PRIORITY = 'cost'; + const cfg = loadConfig({}, cwd); + expect(cfg.priority).toBe('cost'); + expect(cfg.model).toBe('claude-haiku-4-5'); + }); + + it('lets a pinned model win over the priority recommendation', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + const cfg = loadConfig({ model: 'claude-opus-4-8' }, cwd); + expect(cfg.priority).toBe('performance'); + 
expect(cfg.model).toBe('claude-opus-4-8'); + }); + + it('reads priority from a config file', async () => { + await writeFile( + join(cwd, 'tiny-code.config.json'), + JSON.stringify({ provider: 'gemini', priority: 'balanced' }), + ); + const cfg = loadConfig({}, cwd); + expect(cfg.priority).toBe('balanced'); + }); + it('lets env override the config file model', async () => { await writeFile( join(cwd, 'tiny-code.config.json'), diff --git a/tests/models/catalog.test.ts b/tests/models/catalog.test.ts new file mode 100644 index 0000000..f5fdf7e --- /dev/null +++ b/tests/models/catalog.test.ts @@ -0,0 +1,90 @@ +import { describe, it, expect } from 'vitest'; +import { + MODEL_CATALOG, + getModelInfo, + estimateCostUsd, + formatUsd, + recommendModel, +} from '../../src/models/catalog.js'; + +describe('getModelInfo', () => { + it('returns catalog facts for a known model', () => { + const info = getModelInfo('claude-opus-4-8'); + expect(info?.provider).toBe('anthropic'); + expect(info?.inputPricePerMTok).toBe(5); + expect(info?.outputPricePerMTok).toBe(25); + }); + + it('returns undefined for an unknown model', () => { + expect(getModelInfo('gpt-9')).toBeUndefined(); + }); +}); + +describe('estimateCostUsd', () => { + it('prices input and output tokens against the model rate', () => { + const info = getModelInfo('claude-opus-4-8')!; + // 1M input @ $5 + 200K output @ $25 = 5 + 5 = $10 + const cost = estimateCostUsd({ inputTokens: 1_000_000, outputTokens: 200_000 }, info); + expect(cost).toBeCloseTo(10, 6); + }); + + it('is zero for zero usage', () => { + const info = getModelInfo('claude-haiku-4-5')!; + expect(estimateCostUsd({ inputTokens: 0, outputTokens: 0 }, info)).toBe(0); + }); +}); + +describe('formatUsd', () => { + it('shows 4 decimals for sub-dollar amounts and 2 above', () => { + expect(formatUsd(0.0042)).toBe('$0.0042'); + expect(formatUsd(12.5)).toBe('$12.50'); + }); +}); + +describe('recommendModel', () => { + it('performance picks the most capable model for 
the provider', () => { + expect(recommendModel({ provider: 'anthropic', priority: 'performance' })?.id).toBe( + 'claude-opus-4-8', + ); + expect(recommendModel({ provider: 'gemini', priority: 'performance' })?.id).toBe( + 'gemini-2.5-pro', + ); + }); + + it('cost picks the cheapest capable model', () => { + expect(recommendModel({ provider: 'anthropic', priority: 'cost' })?.id).toBe( + 'claude-haiku-4-5', + ); + expect(recommendModel({ provider: 'gemini', priority: 'cost' })?.id).toBe('gemini-2.5-flash'); + }); + + it('balanced trades cost against capability without dropping to the weakest', () => { + expect(recommendModel({ provider: 'anthropic', priority: 'balanced' })?.id).toBe( + 'claude-sonnet-4-6', + ); + }); + + it('respects a context-window floor', () => { + // Haiku has a 200K window; requiring 1M forces a larger model. + const picked = recommendModel({ + provider: 'anthropic', + priority: 'cost', + minContextWindow: 1_000_000, + }); + expect(picked?.id).not.toBe('claude-haiku-4-5'); + expect(picked?.contextWindow).toBeGreaterThanOrEqual(1_000_000); + }); + + it('returns undefined when constraints exclude every model', () => { + expect( + recommendModel({ provider: 'anthropic', priority: 'performance', minCodingScore: 1000 }), + ).toBeUndefined(); + }); +}); + +describe('MODEL_CATALOG', () => { + it('has unique model ids', () => { + const ids = MODEL_CATALOG.map((m) => m.id); + expect(new Set(ids).size).toBe(ids.length); + }); +});