From a800fc8c5143077ec3d4b1a26e4ce93283765255 Mon Sep 17 00:00:00 2001
From: Arnab
Date: Wed, 22 Apr 2026 08:45:39 -0700
Subject: [PATCH] feat(context): lower compaction threshold for Claude-subscription safety

Default CONTEXT_WINDOW drops from 1M to 200K, threshold from 0.8 to 0.75
(effective budget 150K tokens). At the old 800K budget a multi-PR session
could exhaust a Claude 5-hour rate-limit window before compaction ever
triggered.

- Added env overrides: MAX_CONTEXT_WINDOW, MAX_COMPACT_THRESHOLD, MAX_KEEP_RECENT
- Log effective values once per process (first transformContext call)
- Documented in README under "Context sizing"
- .env.example updated

Providers with cheap long context (Gemini direct, 1M-context models) can
raise MAX_CONTEXT_WINDOW back up.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .env.example   |  7 +++++++
 README.md      | 19 +++++++++++++++++++
 src/context.ts | 35 +++++++++++++++++++++++++++++++----
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/.env.example b/.env.example
index 4a8f014..01feb79 100644
--- a/.env.example
+++ b/.env.example
@@ -22,3 +22,10 @@ GPU_HOST=
 GPU_SHUTDOWN_TOKEN=
 MAX_A2A_URL=
 IOS_DEVICE_ID=
+
+# Context sizing — compact history before it eats the model's rate limit.
+# Defaults target Claude subscription safety (150K token budget before compact).
+# For Gemini direct or 1M-context models, raise MAX_CONTEXT_WINDOW.
+MAX_CONTEXT_WINDOW=200000
+MAX_COMPACT_THRESHOLD=0.75
+MAX_KEEP_RECENT=6
diff --git a/README.md b/README.md
index 2c79dbc..b7622d9 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,25 @@ Requirements:
 - `claude` CLI installed and authenticated on the host
 - `ANTHROPIC_BASE_URL` should point to your AgentWeave proxy if you want subagent LLM calls visible in AgentWeave
 
+### Context sizing
+
+Max automatically compacts old history once the running token estimate crosses a threshold. Defaults target a Claude subscription, where every input token counts against the 5-hour rate limit:
+
+| Env var | Default | Meaning |
+|---|---|---|
+| `MAX_CONTEXT_WINDOW` | `200000` | Upper bound used for budget math |
+| `MAX_COMPACT_THRESHOLD` | `0.75` | Fraction of the window before compaction kicks in (default = 150K tokens); values above 1 are clamped to 1 |
+| `MAX_KEEP_RECENT` | `6` | Messages always kept intact at the tail (minimum 1) |
+
+For Gemini direct or other cheap-long-context providers, raise the window:
+
+```bash
+MAX_CONTEXT_WINDOW=1000000
+MAX_COMPACT_THRESHOLD=0.8
+```
+
+The effective values are logged once on the first `transformContext` call.
+
 ### Development
 
 ```bash
diff --git a/src/context.ts b/src/context.ts
index 3e6feb8..a046aaa 100644
--- a/src/context.ts
+++ b/src/context.ts
@@ -3,11 +3,37 @@ import type { AssistantMessage, Message } from "@mariozechner/pi-ai";
 import { getModel, getEnvApiKey, streamSimple } from "@mariozechner/pi-ai";
 import { log } from "./logger.js";
 
-const CONTEXT_WINDOW = 1_000_000;
-const COMPACT_THRESHOLD = 0.8; // compact at 80%
+/**
+ * Context-sizing knobs.
+ *
+ * Defaults target a Claude subscription path, where every input token counts
+ * against the 5-hour rate limit — compact at ~150K before a long session can
+ * exhaust quota.
+ *
+ * Override via env for providers with cheaper long context (e.g. Gemini direct):
+ *   MAX_CONTEXT_WINDOW=1000000 MAX_COMPACT_THRESHOLD=0.8 MAX_KEEP_RECENT=6
+ */
+function envNumber(name: string, fallback: number): number {
+  const raw = process.env[name];
+  if (!raw) return fallback;
+  const n = Number(raw);
+  return Number.isFinite(n) && n > 0 ? n : fallback;
+}
+
+const CONTEXT_WINDOW = envNumber("MAX_CONTEXT_WINDOW", 200_000);
+const COMPACT_THRESHOLD = Math.min(1, envNumber("MAX_COMPACT_THRESHOLD", 0.75)); // a fraction above 1 (e.g. "75") would put the limit past the window and disable compaction
 const TOKEN_LIMIT = Math.floor(CONTEXT_WINDOW * COMPACT_THRESHOLD);
-// Keep at least the last N messages untouched during compaction
-const KEEP_RECENT = 6;
+const KEEP_RECENT = Math.max(1, Math.floor(envNumber("MAX_KEEP_RECENT", 6))); // values in (0, 1) pass envNumber but floor to 0; keep at least one message
+
+let loggedConfig = false;
+function logConfigOnce(): void {
+  if (loggedConfig) return;
+  loggedConfig = true;
+  log(
+    "info",
+    `Context sizing: window=${CONTEXT_WINDOW} threshold=${TOKEN_LIMIT} (${Math.round(COMPACT_THRESHOLD * 100)}%) keepRecent=${KEEP_RECENT}`
+  );
+}
 
 /** Rough token estimate: ~4 chars per token for text, actual usage for assistant messages */
 function estimateMessageTokens(msg: AgentMessage): number {
@@ -77,6 +103,7 @@ export function getContextStats(messages: AgentMessage[]): ContextStats {
  * - Replace with a single compact user message containing the summary
  */
 export async function transformContext(messages: AgentMessage[]): Promise {
+  logConfigOnce();
   const totalTokens = messages.reduce((sum, m) => sum + estimateMessageTokens(m), 0);
 
   if (totalTokens <= TOKEN_LIMIT) {