def _run_budget_checks(tmp: Path) -> None:
    """Exercise `_budget_for_purpose` precedence and the wire payload.

    Writes a stub pricing file under `tmp`, points the server module at it,
    then asserts the documented precedence order:

      1. CFG.token_budgets[purpose]
      2. CFG.token_budgets_by_provider[provider][purpose]
      3. shipped per-provider defaults
      4. non-reasoning ceiling
      5. reasoning-safe default

    Args:
        tmp: Existing scratch directory; the pricing file, session DB path
            and transcript dir are all placed beneath it.

    Raises:
        AssertionError: when any budget expectation does not hold.

    Side effects: mutates `os.environ` and rebinds several module globals
    on `crosscheck_server` (CFG, ENV, ALL_PROVIDERS, _http_post_resilient,
    ...). Nothing is restored afterwards; this is meant to run in a
    throw-away process.
    """
    # Local import, matching the script's existing style of importing at the
    # point of use.
    import time as _t

    pricing = tmp / "pricing.json"
    pricing.write_text(json.dumps({
        "openai": {
            "gpt-test": {"prompt_per_1k": 0.0001, "completion_per_1k": 0.0003, "cached_per_1k": 0.00005},
            "gpt-5": {"prompt_per_1k": 0.001, "completion_per_1k": 0.003, "cached_per_1k": 0.0005},
        },
        "anthropic": {
            "claude-test": {"prompt_per_1k": 0.003, "completion_per_1k": 0.015, "cached_per_1k": 0.0003},
        },
        "xai": {
            "grok-test": {"prompt_per_1k": 0.005, "completion_per_1k": 0.015, "cached_per_1k": 0.0025},
            "grok-4-latest": {"prompt_per_1k": 0.005, "completion_per_1k": 0.015, "cached_per_1k": 0.0025},
        },
    }))
    # Environment is prepared BEFORE the server module is imported, same
    # ordering as the original script.
    os.environ["CROSSCHECK_PRICING_PATH"] = str(pricing)
    os.environ.pop("CROSSCHECK_REJECT_CONFIG_DRIFT", None)

    import crosscheck_server as srv

    # Work on a private copy of CFG so the keys poked below do not alias the
    # dict object the module created at import time.
    srv.CFG = dict(srv.CFG)
    srv.CFG["session_db"] = str(tmp / "sessions.db")
    srv.CFG["transcript_dir"] = str(tmp / "transcripts")
    srv.CFG["cache"] = {"enabled": False}
    srv.CFG["node_cache"] = {"enabled": False}
    srv.CFG["prompt_adapters"] = {"enabled": False}
    srv.CFG["config_pinning"] = {"reject_drift": False}
    srv.CFG.pop("token_budgets", None)
    srv.CFG.pop("token_budgets_by_provider", None)
    srv.TRANSCRIPT_DIR = Path(srv.CFG["transcript_dir"])
    srv._DB_INIT_DONE = False
    srv._FTS5_AVAILABLE = None
    srv._PRICING_CACHE = None
    srv.PRICING_PATH = pricing
    srv._CONFIG_PIN_STARTUP_DONE = True

    # ------------------------------------------------------------------
    # 1) Shipped openai confer/debate override = 6144 (the actual ship)
    # ------------------------------------------------------------------
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144
    assert srv._budget_for_purpose("debate", "openai", "gpt-5") == 6144
    # Non-reasoning OpenAI model: the override still applies — it's keyed
    # on provider, not model class. (The whole point is that the provider
    # has a quirky reasoning-token accounting model.)
    assert srv._budget_for_purpose("confer", "openai", "gpt-test") == 6144

    # Non-overridden purposes for OpenAI fall through to the tier table.
    # audit / synth on a NON-reasoning model -> non-reasoning ceiling.
    assert srv._budget_for_purpose("audit", "openai", "gpt-test") == 768
    assert srv._budget_for_purpose("synth", "openai", "gpt-test") == 1024
    # audit / synth on a reasoning model -> reasoning-safe default.
    assert srv._budget_for_purpose("audit", "openai", "gpt-5") == 2048

    # ------------------------------------------------------------------
    # 2) Other providers are unaffected by the openai shipped override.
    # ------------------------------------------------------------------
    assert srv._budget_for_purpose("confer", "anthropic", "claude-test") == 1500  # non-reasoning ceiling
    assert srv._budget_for_purpose("debate", "anthropic", "claude-test") == 1500
    assert srv._budget_for_purpose("confer", "anthropic", "claude-opus-4-7") == 2048  # reasoning default
    assert srv._budget_for_purpose("confer", "xai", "grok-4-latest") == 1500

    # ------------------------------------------------------------------
    # 3) Precedence: global CFG.token_budgets beats the per-provider override
    # ------------------------------------------------------------------
    srv.CFG["token_budgets"] = {"confer": 999}
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 999
    # And it still beats the provider override for OTHER models too:
    assert srv._budget_for_purpose("confer", "anthropic", "claude-test") == 999
    srv.CFG.pop("token_budgets", None)

    # ------------------------------------------------------------------
    # 4) Precedence: CFG.token_budgets_by_provider beats shipped defaults
    # ------------------------------------------------------------------
    srv.CFG["token_budgets_by_provider"] = {
        "openai": {"confer": 8192},     # bump higher than the shipped 6144
        "anthropic": {"debate": 4096},  # add an override anthropic didn't have
    }
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 8192
    # Anthropic now has a debate override even though there's no shipped one
    assert srv._budget_for_purpose("debate", "anthropic", "claude-opus-4-7") == 4096
    # Non-overridden purpose on the same provider still falls through
    assert srv._budget_for_purpose("audit", "openai", "gpt-5") == 2048
    # And the global CFG.token_budgets still wins over the per-provider one
    srv.CFG["token_budgets"] = {"confer": 500}
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 500
    srv.CFG.pop("token_budgets", None)
    srv.CFG.pop("token_budgets_by_provider", None)

    # ------------------------------------------------------------------
    # 5) Operator-supplied override of 0 / negative is ignored
    # ------------------------------------------------------------------
    srv.CFG["token_budgets_by_provider"] = {"openai": {"confer": 0}}
    # Falls through to the SHIPPED override (6144), not the table below it
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144
    srv.CFG["token_budgets_by_provider"] = {"openai": {"confer": -1}}
    assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144
    srv.CFG.pop("token_budgets_by_provider", None)

    # ------------------------------------------------------------------
    # 6) No provider / no model -> reasoning-safe defaults (unchanged)
    # ------------------------------------------------------------------
    assert srv._budget_for_purpose("confer") == 2048
    assert srv._budget_for_purpose("debate") == 2048
    assert srv._budget_for_purpose("audit") == 2048

    # ------------------------------------------------------------------
    # 7) End-to-end: the wire payload for openai confer carries 6144
    # ------------------------------------------------------------------
    srv.ENV = dict(srv.ENV)
    srv.ENV["OPENAI_API_KEY"] = "stub"
    srv.ENV["OPENAI_MODEL"] = "gpt-5"
    srv.ENV["ANTHROPIC_API_KEY"] = "stub"
    srv.ENV["ANTHROPIC_MODEL"] = "claude-opus-4-7"
    srv.ALL_PROVIDERS = srv.build_providers()

    captured: list[dict] = []

    def fake_post(url, h, b, **kw):
        # Record the outgoing request and answer with a canned reply shaped
        # per provider, so no network traffic happens.
        captured.append({"url": url, "body": b})
        if "openai.com" in url:
            return ({"choices": [{"message": {"content": "ok"}}],
                     "usage": {"prompt_tokens": 30, "completion_tokens": 10}}, 1)
        return ({"content": [{"type": "text", "text": "ok"}],
                 "usage": {"input_tokens": 30, "output_tokens": 10}}, 1)

    srv._http_post_resilient = fake_post

    # Caller passes a huge max_tokens (8000); the budget should clamp it to
    # 6144 because openai+confer trips the shipped override.
    srv._ask_one(srv.ALL_PROVIDERS["openai"],
                 [{"role": "user", "content": "hi"}],
                 deadline=_t.monotonic() + 10, max_tokens=8000, purpose="confer")
    # Fail with a readable message instead of an IndexError when the stubbed
    # transport was never reached.
    assert captured, "openai _ask_one never reached _http_post_resilient"
    sent = captured[-1]["body"]
    # gpt-5 is reasoning class -> openai uses max_completion_tokens, not max_tokens
    assert sent.get("max_completion_tokens") == 6144, sent

    # Same call on Anthropic confer (no provider override) clamps to the
    # reasoning default of 2048.
    captured.clear()
    srv._ask_one(srv.ALL_PROVIDERS["anthropic"],
                 [{"role": "user", "content": "hi"}],
                 deadline=_t.monotonic() + 10, max_tokens=8000, purpose="confer")
    assert captured, "anthropic _ask_one never reached _http_post_resilient"
    assert captured[-1]["body"].get("max_tokens") == 2048, captured[-1]


def main() -> int:
    """Run the per-provider token-budget checks and return a process exit code.

    Puts `<repo>/servers/python` on `sys.path`, creates a scratch directory,
    runs every check against `crosscheck_server`, and always removes the
    scratch directory afterwards.

    Returns:
        0 when every check passed.

    Raises:
        AssertionError: propagated from a failed check.
        ImportError: when `crosscheck_server` cannot be imported.
    """
    import shutil

    here = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(here / "servers" / "python"))

    tmp = Path(tempfile.mkdtemp())
    try:
        _run_budget_checks(tmp)
    finally:
        # mkdtemp() leaves deletion to the caller; remove the scratch tree on
        # success AND on failure. ignore_errors keeps a cleanup hiccup from
        # masking the real assertion error.
        shutil.rmtree(tmp, ignore_errors=True)

    print("OK: test_provider_token_budgets")
    return 0
+_PROVIDER_TOKEN_BUDGETS: dict[str, dict[str, int]] = { + "openai": { + "confer": 6144, + "debate": 6144, + }, +} + def _budget_for_purpose(purpose: str, provider: str | None = None, model: str | None = None) -> int | None: - """Return the configured ceiling for `purpose`. When provider+model are - supplied AND that model is NOT reasoning-class, prefers the (smaller) - non-reasoning ceiling — restoring the PR #7 cost savings for models - that don't need the bigger headroom. + """Return the configured ceiling for `purpose`. Precedence (highest to + lowest): + + 1. `CFG.token_budgets[purpose]` — global caller override + 2. `CFG.token_budgets_by_provider[provider][purpose]` — per-provider + operator override (no code changes needed) + 3. `_PROVIDER_TOKEN_BUDGETS[provider][purpose]` — shipped per-provider + overrides (e.g. openai confer/debate; see comment above the table) + 4. `_NON_REASONING_TOKEN_BUDGETS[purpose]` — non-reasoning model + 5. `_DEFAULT_TOKEN_BUDGETS[purpose]` — reasoning-safe default Caller uses min(default_from_token_cap, this_ceiling).""" custom = (CFG.get("token_budgets") or {}).get(purpose) if isinstance(custom, int) and custom > 0: return int(custom) + + if isinstance(provider, str) and provider: + # Operator-supplied per-provider override (no code changes). + cfg_by_provider = (CFG.get("token_budgets_by_provider") or {}) + op_table = cfg_by_provider.get(provider) if isinstance(cfg_by_provider, dict) else None + if isinstance(op_table, dict): + op_val = op_table.get(purpose) + if isinstance(op_val, int) and op_val > 0: + return int(op_val) + # Shipped per-provider override. + shipped = _PROVIDER_TOKEN_BUDGETS.get(provider) or {} + shipped_val = shipped.get(purpose) + if isinstance(shipped_val, int) and shipped_val > 0: + return int(shipped_val) + # Tier-aware: non-reasoning models get the smaller ceiling. 
When we # don't know the model (helper called without arguments) fall through # to the reasoning-safe default so we never starve a reasoning auditor.