From b12503f6a82b5c601bb9c6772873b162e430796c Mon Sep 17 00:00:00 2001 From: Frank Speiser Date: Fri, 29 May 2026 19:11:44 -0400 Subject: [PATCH] Bump OpenAI confer/debate token budget; add per-provider override layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: OpenAI gpt-5 routinely runs out of `max_completion_tokens` mid-answer during `confer` and `debate`. Root cause: gpt-5 charges reasoning tokens against `max_completion_tokens`. At the reasoning-class default of 2048 it spends 1500-3000 tokens on internal thinking and the visible reply gets MAX_TOKENS-truncated. Fix: add a per-provider override layer in `_budget_for_purpose`. The new precedence (highest to lowest) is: 1. `CFG.token_budgets[purpose]` — caller override 2. `CFG.token_budgets_by_provider[provider][purpose]` — per-provider operator override (no code changes needed) 3. `_PROVIDER_TOKEN_BUDGETS[provider][purpose]` — shipped per-provider overrides (this PR seeds openai.confer / openai.debate = 6144) 4. `_NON_REASONING_TOKEN_BUDGETS[purpose]` — non-reasoning model 5. `_DEFAULT_TOKEN_BUDGETS[purpose]` — reasoning default Shipped seed values: - openai.confer = 6144 - openai.debate = 6144 6144 = ~3k headroom for gpt-5 reasoning + ~3k room for the visible answer. Other providers are unaffected (anthropic / xai / gemini / mistral / groq / deepseek all fall through to the existing reasoning- or non-reasoning ceilings). Anti-foot-gun: per-provider override values <= 0 are ignored (fall through to the next tier). The override is keyed on the provider name (not model class) because the issue is specifically how OpenAI's API accounts for reasoning tokens — even gpt-test (a non-reasoning OpenAI stub) gets the higher ceiling, since the concern is provider behavior rather than model class.
Tests (scripts/test_provider_token_budgets.py): - Shipped openai confer/debate = 6144 (reasoning + non-reasoning models) - Non-overridden purposes still respect tier ceilings (audit/synth fall to non-reasoning table for gpt-test, reasoning default for gpt-5) - Other providers unaffected by the openai override - CFG.token_budgets[purpose] beats the per-provider override - CFG.token_budgets_by_provider beats the shipped defaults; can also add overrides for providers that don't have shipped ones - Operator-supplied 0 / negative fall through to the next tier - No provider/model -> reasoning-safe defaults unchanged - End-to-end: openai confer wire payload carries max_completion_tokens=6144 (not 2048); anthropic confer still 2048 Full suite (38 scripts) passes. Co-Authored-By: Claude Opus 4.7 --- scripts/test_provider_token_budgets.py | 172 +++++++++++++++++++++++++ servers/python/crosscheck_server.py | 46 ++++++- 2 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 scripts/test_provider_token_budgets.py diff --git a/scripts/test_provider_token_budgets.py b/scripts/test_provider_token_budgets.py new file mode 100644 index 0000000..3bcfcbf --- /dev/null +++ b/scripts/test_provider_token_budgets.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""Tests for per-provider token-budget overrides. + +Problem this solves: gpt-5 charges reasoning tokens against +`max_completion_tokens`. At the reasoning-class default of 2048 it +routinely runs out before emitting visible answer text on multi-round +work like confer / debate. The fix is a per-provider override layer +sitting between the global CFG override and the reasoning / non- +reasoning fall-back tables. + +Precedence the test enforces (highest first): + 1. CFG.token_budgets[purpose] + 2. CFG.token_budgets_by_provider[provider][purpose] + 3. _PROVIDER_TOKEN_BUDGETS[provider][purpose] (shipped defaults) + 4. _NON_REASONING_TOKEN_BUDGETS[purpose] (non-reasoning model) + 5. 
_DEFAULT_TOKEN_BUDGETS[purpose] (reasoning-safe default) +""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +from pathlib import Path + + +def main() -> int: + here = Path(__file__).resolve().parents[1] + sys.path.insert(0, str(here / "servers" / "python")) + + tmp = Path(tempfile.mkdtemp()) + pricing = tmp / "pricing.json" + pricing.write_text(json.dumps({ + "openai": {"gpt-test": {"prompt_per_1k": 0.0001, "completion_per_1k": 0.0003, "cached_per_1k": 0.00005}, + "gpt-5": {"prompt_per_1k": 0.001, "completion_per_1k": 0.003, "cached_per_1k": 0.0005}}, + "anthropic": {"claude-test": {"prompt_per_1k": 0.003, "completion_per_1k": 0.015, "cached_per_1k": 0.0003}}, + "xai": {"grok-test": {"prompt_per_1k": 0.005, "completion_per_1k": 0.015, "cached_per_1k": 0.0025}, + "grok-4-latest": {"prompt_per_1k": 0.005, "completion_per_1k": 0.015, "cached_per_1k": 0.0025}}, + })) + os.environ["CROSSCHECK_PRICING_PATH"] = str(pricing) + os.environ.pop("CROSSCHECK_REJECT_CONFIG_DRIFT", None) + + import crosscheck_server as srv + + srv.CFG = dict(srv.CFG) + srv.CFG["session_db"] = str(tmp / "sessions.db") + srv.CFG["transcript_dir"] = str(tmp / "transcripts") + srv.CFG["cache"] = {"enabled": False} + srv.CFG["node_cache"] = {"enabled": False} + srv.CFG["prompt_adapters"] = {"enabled": False} + srv.CFG["config_pinning"] = {"reject_drift": False} + srv.CFG.pop("token_budgets", None) + srv.CFG.pop("token_budgets_by_provider", None) + srv.TRANSCRIPT_DIR = Path(srv.CFG["transcript_dir"]) + srv._DB_INIT_DONE = False + srv._FTS5_AVAILABLE = None + srv._PRICING_CACHE = None + srv.PRICING_PATH = pricing + srv._CONFIG_PIN_STARTUP_DONE = True + + # ------------------------------------------------------------------ + # 1) Shipped openai confer/debate override = 6144 (the actual ship) + # ------------------------------------------------------------------ + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144 + assert 
srv._budget_for_purpose("debate", "openai", "gpt-5") == 6144 + # Non-reasoning OpenAI model: the override still applies — it's keyed + # on provider, not model class. (The whole point is that the provider + # has a quirky reasoning-token accounting model.) + assert srv._budget_for_purpose("confer", "openai", "gpt-test") == 6144 + + # Non-overridden purposes for OpenAI fall through to the tier table. + # audit / synth on a NON-reasoning model -> non-reasoning ceiling. + assert srv._budget_for_purpose("audit", "openai", "gpt-test") == 768 + assert srv._budget_for_purpose("synth", "openai", "gpt-test") == 1024 + # audit / synth on a reasoning model -> reasoning-safe default. + assert srv._budget_for_purpose("audit", "openai", "gpt-5") == 2048 + + # ------------------------------------------------------------------ + # 2) Other providers are unaffected by the openai shipped override. + # ------------------------------------------------------------------ + assert srv._budget_for_purpose("confer", "anthropic", "claude-test") == 1500 # non-reasoning ceiling + assert srv._budget_for_purpose("debate", "anthropic", "claude-test") == 1500 + assert srv._budget_for_purpose("confer", "anthropic", "claude-opus-4-7") == 2048 # reasoning default + assert srv._budget_for_purpose("confer", "xai", "grok-4-latest") == 1500 + + # ------------------------------------------------------------------ + # 3) Precedence: global CFG.token_budgets beats the per-provider override + # ------------------------------------------------------------------ + srv.CFG["token_budgets"] = {"confer": 999} + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 999 + # And it still beats the provider override for OTHER models too: + assert srv._budget_for_purpose("confer", "anthropic", "claude-test") == 999 + srv.CFG.pop("token_budgets", None) + + # ------------------------------------------------------------------ + # 4) Precedence: CFG.token_budgets_by_provider beats shipped defaults + # 
------------------------------------------------------------------ + srv.CFG["token_budgets_by_provider"] = { + "openai": {"confer": 8192}, # bump higher than the shipped 6144 + "anthropic": {"debate": 4096}, # add an override anthropic didn't have + } + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 8192 + # Anthropic now has a debate override even though there's no shipped one + assert srv._budget_for_purpose("debate", "anthropic", "claude-opus-4-7") == 4096 + # Non-overridden purpose on the same provider still falls through + assert srv._budget_for_purpose("audit", "openai", "gpt-5") == 2048 + # And the global CFG.token_budgets still wins over the per-provider one + srv.CFG["token_budgets"] = {"confer": 500} + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 500 + srv.CFG.pop("token_budgets", None) + srv.CFG.pop("token_budgets_by_provider", None) + + # ------------------------------------------------------------------ + # 5) Operator-supplied override of 0 / negative is ignored + # ------------------------------------------------------------------ + srv.CFG["token_budgets_by_provider"] = {"openai": {"confer": 0}} + # Falls through to the SHIPPED override (6144), not the table below it + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144 + srv.CFG["token_budgets_by_provider"] = {"openai": {"confer": -1}} + assert srv._budget_for_purpose("confer", "openai", "gpt-5") == 6144 + srv.CFG.pop("token_budgets_by_provider", None) + + # ------------------------------------------------------------------ + # 6) No provider / no model -> reasoning-safe defaults (unchanged) + # ------------------------------------------------------------------ + assert srv._budget_for_purpose("confer") == 2048 + assert srv._budget_for_purpose("debate") == 2048 + assert srv._budget_for_purpose("audit") == 2048 + + # ------------------------------------------------------------------ + # 7) End-to-end: the wire payload for openai confer carries 6144 
+ # ------------------------------------------------------------------ + srv.ENV = dict(srv.ENV) + srv.ENV["OPENAI_API_KEY"] = "stub"; srv.ENV["OPENAI_MODEL"] = "gpt-5" + srv.ENV["ANTHROPIC_API_KEY"] = "stub"; srv.ENV["ANTHROPIC_MODEL"] = "claude-opus-4-7" + srv.ALL_PROVIDERS = srv.build_providers() + + captured: list[dict] = [] + def fake_post(url, h, b, **kw): + captured.append({"url": url, "body": b}) + if "openai.com" in url: + return ({"choices": [{"message": {"content": "ok"}}], + "usage": {"prompt_tokens": 30, "completion_tokens": 10}}, 1) + return ({"content": [{"type": "text", "text": "ok"}], + "usage": {"input_tokens": 30, "output_tokens": 10}}, 1) + srv._http_post_resilient = fake_post + + import time as _t + # Caller passes a huge max_tokens (8000); the budget should clamp it to + # 6144 because openai+confer trips the shipped override. + srv._ask_one(srv.ALL_PROVIDERS["openai"], + [{"role": "user", "content": "hi"}], + deadline=_t.monotonic() + 10, max_tokens=8000, purpose="confer") + sent = captured[-1]["body"] + # gpt-5 is reasoning class -> openai uses max_completion_tokens, not max_tokens + assert sent.get("max_completion_tokens") == 6144, sent + + # Same call on Anthropic confer (no provider override) clamps to the + # reasoning default of 2048. 
+ captured.clear() + srv._ask_one(srv.ALL_PROVIDERS["anthropic"], + [{"role": "user", "content": "hi"}], + deadline=_t.monotonic() + 10, max_tokens=8000, purpose="confer") + assert captured[-1]["body"].get("max_tokens") == 2048, captured[-1] + + print("OK: test_provider_token_budgets") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/servers/python/crosscheck_server.py b/servers/python/crosscheck_server.py index 9cec871..6f5cbee 100755 --- a/servers/python/crosscheck_server.py +++ b/servers/python/crosscheck_server.py @@ -2517,18 +2517,56 @@ def _fts_index_transcript(kind: str, payload: dict, path: str, ts_ms: int) -> No "solve": 2048, } +# Provider-specific overrides — applied AFTER the explicit CFG override but +# BEFORE the reasoning / non-reasoning fall-back. Use this when a particular +# provider's reasoning budget eats `max_completion_tokens` so aggressively +# that the standard 2048 leaves no room for the visible answer. +# +# OpenAI: gpt-5 charges reasoning tokens against `max_completion_tokens` and +# can spend 1500-3000 of them on internal thinking before the visible reply. +# At 2048 the visible reply gets MAX_TOKENS-truncated mid-sentence on multi- +# round work like confer / debate. 6144 = ~3k reasoning + ~3k visible answer +# is empirically the bottom of the "always lands cleanly" zone. +_PROVIDER_TOKEN_BUDGETS: dict[str, dict[str, int]] = { + "openai": { + "confer": 6144, + "debate": 6144, + }, +} + def _budget_for_purpose(purpose: str, provider: str | None = None, model: str | None = None) -> int | None: - """Return the configured ceiling for `purpose`. When provider+model are - supplied AND that model is NOT reasoning-class, prefers the (smaller) - non-reasoning ceiling — restoring the PR #7 cost savings for models - that don't need the bigger headroom. + """Return the configured ceiling for `purpose`. Precedence (highest to + lowest): + + 1. `CFG.token_budgets[purpose]` — global caller override + 2. 
`CFG.token_budgets_by_provider[provider][purpose]` — per-provider + operator override (no code changes needed) + 3. `_PROVIDER_TOKEN_BUDGETS[provider][purpose]` — shipped per-provider + overrides (e.g. openai confer/debate; see comment above the table) + 4. `_NON_REASONING_TOKEN_BUDGETS[purpose]` — non-reasoning model + 5. `_DEFAULT_TOKEN_BUDGETS[purpose]` — reasoning-safe default Caller uses min(default_from_token_cap, this_ceiling).""" custom = (CFG.get("token_budgets") or {}).get(purpose) if isinstance(custom, int) and custom > 0: return int(custom) + + if isinstance(provider, str) and provider: + # Operator-supplied per-provider override (no code changes). + cfg_by_provider = (CFG.get("token_budgets_by_provider") or {}) + op_table = cfg_by_provider.get(provider) if isinstance(cfg_by_provider, dict) else None + if isinstance(op_table, dict): + op_val = op_table.get(purpose) + if isinstance(op_val, int) and op_val > 0: + return int(op_val) + # Shipped per-provider override. + shipped = _PROVIDER_TOKEN_BUDGETS.get(provider) or {} + shipped_val = shipped.get(purpose) + if isinstance(shipped_val, int) and shipped_val > 0: + return int(shipped_val) + # Tier-aware: non-reasoning models get the smaller ceiling. When we # don't know the model (helper called without arguments) fall through # to the reasoning-safe default so we never starve a reasoning auditor.