Skip to content

Commit be8d387

Browse files
authored
Fixed Response Model Bug (#51)
* Fixed a bug in the model adapter. * Added a test.
1 parent 8641a10 commit be8d387

3 files changed

Lines changed: 207 additions & 1 deletion

File tree

maseval/interface/inference/google_genai.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,15 @@ def _structured_chat(
331331
) -> "ChatResponse":
332332
"""Use instructor for structured output with validation and retries."""
333333
if self._instructor_client is None:
334+
import instructor
334335
from instructor import from_genai
335336

336-
self._instructor_client = from_genai(self._client)
337+
# Use GENAI_STRUCTURED_OUTPUTS (native JSON schema output) instead of
338+
# GENAI_TOOLS (function calling). GENAI_TOOLS triggers instructor bugs when
339+
# Gemini thinking mode is enabled: (1) content is None on MALFORMED_FUNCTION_CALL
340+
# causing an AttributeError in parse_genai_tools, and (2) duplicate function call
341+
# parts fail an assertion. GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
342+
self._instructor_client = from_genai(self._client, mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS)
337343

338344
params = dict(self._default_generation_params)
339345
if generation_params:

tests/test_interface/test_model_integration/test_live_api.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
OPENAI_MODEL = "gpt-4o-mini"
4242
ANTHROPIC_MODEL = "claude-haiku-4-5"
4343
GOOGLE_MODEL = "gemini-2.0-flash"
44+
GOOGLE_THINKING_MODEL = "gemini-3-flash-preview"
4445
LITELLM_MODEL = "gpt-4o-mini"
4546

4647

@@ -372,3 +373,128 @@ def test_structured_output(self):
372373
assert response.structured_response.city.lower() == "paris"
373374
assert response.structured_response.country.lower() == "france"
374375
assert response.content is not None
376+
377+
378+
# =============================================================================
379+
# Cross-provider parameterized tests
380+
# =============================================================================
381+
382+
383+
def _make_openai_adapter(**kwargs):
384+
from openai import OpenAI
385+
from maseval.interface.inference.openai import OpenAIModelAdapter
386+
387+
return OpenAIModelAdapter(client=OpenAI(), model_id=OPENAI_MODEL, **kwargs)
388+
389+
390+
def _make_anthropic_adapter(**kwargs):
391+
from anthropic import Anthropic
392+
from maseval.interface.inference.anthropic import AnthropicModelAdapter
393+
394+
return AnthropicModelAdapter(client=Anthropic(), model_id=ANTHROPIC_MODEL, max_tokens=100, **kwargs)
395+
396+
397+
def _make_google_adapter(**kwargs):
398+
from google import genai
399+
from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
400+
401+
return GoogleGenAIModelAdapter(client=genai.Client(), model_id=GOOGLE_MODEL, **kwargs)
402+
403+
404+
def _make_litellm_adapter(**kwargs):
405+
pytest.importorskip("litellm")
406+
from maseval.interface.inference.litellm import LiteLLMModelAdapter
407+
408+
return LiteLLMModelAdapter(model_id=LITELLM_MODEL, **kwargs)
409+
410+
411+
# Each entry: (factory, env_var, max_tokens_param_name, supports_seed)
412+
_ADAPTER_CONFIGS = [
413+
pytest.param(_make_openai_adapter, "OPENAI_API_KEY", "max_tokens", True, id="openai"),
414+
pytest.param(_make_anthropic_adapter, "ANTHROPIC_API_KEY", "max_tokens", False, id="anthropic"),
415+
pytest.param(_make_google_adapter, "GOOGLE_API_KEY", "max_output_tokens", True, id="google"),
416+
pytest.param(_make_litellm_adapter, "OPENAI_API_KEY", "max_tokens", True, id="litellm"),
417+
]
418+
419+
420+
class TestCrossProviderStructuredOutput:
421+
"""Parameterized structured output tests across all adapters."""
422+
423+
@pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
424+
def test_structured_output_with_generation_params(self, factory, env_var, max_tok_key, supports_seed):
425+
"""Structured output works with temperature and seed across all providers."""
426+
if not os.environ.get(env_var):
427+
pytest.skip(f"{env_var} not set")
428+
adapter = factory(seed=42) if supports_seed else factory()
429+
response = adapter.chat(
430+
[{"role": "user", "content": "What is the capital of France?"}],
431+
response_model=Capital,
432+
generation_params={"temperature": 0.0, max_tok_key: 100},
433+
)
434+
assert isinstance(response.structured_response, Capital)
435+
assert response.structured_response.city.lower() == "paris"
436+
assert response.structured_response.country.lower() == "france"
437+
438+
@pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
439+
def test_tool_call_then_structured_output(self, factory, env_var, max_tok_key, supports_seed):
440+
"""Tool calling and structured output both work on the same adapter instance."""
441+
if not os.environ.get(env_var):
442+
pytest.skip(f"{env_var} not set")
443+
adapter = factory()
444+
445+
# Tool call
446+
tool_response = adapter.chat(
447+
[{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}],
448+
tools=[WEATHER_TOOL],
449+
generation_params={max_tok_key: 100},
450+
)
451+
assert tool_response.tool_calls is not None
452+
assert len(tool_response.tool_calls) >= 1
453+
assert tool_response.tool_calls[0]["function"]["name"] == "get_weather"
454+
455+
args = json.loads(tool_response.tool_calls[0]["function"]["arguments"])
456+
assert isinstance(args, dict)
457+
assert "city" in args
458+
459+
# Structured output on the same adapter
460+
structured_response = adapter.chat(
461+
[{"role": "user", "content": "What is the capital of France?"}],
462+
response_model=Capital,
463+
generation_params={max_tok_key: 100},
464+
)
465+
assert isinstance(structured_response.structured_response, Capital)
466+
assert structured_response.structured_response.city.lower() == "paris"
467+
468+
469+
class TestGoogleGenAIThinking:
470+
"""Google GenAI structured output with thinking mode enabled.
471+
472+
Validates the workaround for instructor GENAI_TOOLS bugs with thinking mode:
473+
(1) content is None on MALFORMED_FUNCTION_CALL causing AttributeError, and
474+
(2) duplicate function call parts failing an assertion.
475+
Using GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
476+
"""
477+
478+
@requires_google
479+
def test_structured_output_with_thinking(self):
480+
"""Structured output works when Gemini thinking mode is enabled."""
481+
from google import genai
482+
from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
483+
484+
client = genai.Client()
485+
adapter = GoogleGenAIModelAdapter(
486+
client=client,
487+
model_id=GOOGLE_THINKING_MODEL,
488+
default_generation_params={
489+
"thinking_config": {"thinking_budget": 1024},
490+
},
491+
)
492+
response = adapter.chat(
493+
[{"role": "user", "content": "What is the capital of France?"}],
494+
response_model=Capital,
495+
)
496+
497+
assert isinstance(response.structured_response, Capital)
498+
assert response.structured_response.city.lower() == "paris"
499+
assert response.structured_response.country.lower() == "france"
500+
assert response.content is not None

tests/test_interface/test_model_integration/test_model_adapters.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2390,6 +2390,80 @@ def __init__(self):
23902390
gen_config = call_kwargs.kwargs.get("generation_config", {})
23912391
assert gen_config.get("seed") == 99
23922392

2393+
def test_structured_chat_separates_instructor_top_level_keys(self):
2394+
"""thinking_config stays top-level, generation params go into generation_config."""
2395+
pytest.importorskip("google.genai")
2396+
from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
2397+
2398+
class MockClient:
2399+
class Models:
2400+
def generate_content(self, model, contents, config=None):
2401+
class Response:
2402+
text = "ok"
2403+
2404+
return Response()
2405+
2406+
def __init__(self):
2407+
self.models = self.Models()
2408+
2409+
adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro", seed=42)
2410+
2411+
mock_result = _make_mock_instructor_result()
2412+
mock_instructor = MagicMock()
2413+
mock_instructor.chat.completions.create.return_value = mock_result
2414+
adapter._instructor_client = mock_instructor
2415+
2416+
adapter._structured_chat(
2417+
messages=[{"role": "user", "content": "Hi"}],
2418+
response_model=object,
2419+
generation_params={"temperature": 0.5, "thinking_config": {"thinking_budget": 1024}},
2420+
)
2421+
2422+
call_kwargs = mock_instructor.chat.completions.create.call_args.kwargs
2423+
# thinking_config must be top-level (instructor pops it from kwargs directly)
2424+
assert call_kwargs.get("thinking_config") == {"thinking_budget": 1024}
2425+
# generation params must be nested inside generation_config
2426+
gen_config = call_kwargs.get("generation_config", {})
2427+
assert gen_config.get("temperature") == 0.5
2428+
assert gen_config.get("seed") == 42
2429+
# thinking_config must NOT be in generation_config
2430+
assert "thinking_config" not in gen_config
2431+
2432+
def test_structured_chat_uses_structured_outputs_mode(self):
2433+
"""Instructor client is created with GENAI_STRUCTURED_OUTPUTS mode."""
2434+
pytest.importorskip("google.genai")
2435+
pytest.importorskip("instructor")
2436+
import instructor
2437+
from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
2438+
2439+
class MockClient:
2440+
class Models:
2441+
def generate_content(self, model, contents, config=None):
2442+
class Response:
2443+
text = "ok"
2444+
2445+
return Response()
2446+
2447+
def __init__(self):
2448+
self.models = self.Models()
2449+
2450+
adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro")
2451+
assert adapter._instructor_client is None
2452+
2453+
with patch("instructor.from_genai") as mock_from_genai:
2454+
mock_instructor = MagicMock()
2455+
mock_instructor.chat.completions.create.return_value = _make_mock_instructor_result()
2456+
mock_from_genai.return_value = mock_instructor
2457+
2458+
adapter._structured_chat(
2459+
messages=[{"role": "user", "content": "Hi"}],
2460+
response_model=object,
2461+
)
2462+
2463+
mock_from_genai.assert_called_once()
2464+
call_kwargs = mock_from_genai.call_args
2465+
assert call_kwargs.kwargs.get("mode") == instructor.Mode.GENAI_STRUCTURED_OUTPUTS
2466+
23932467

23942468
@pytest.mark.interface
23952469
class TestLiteLLMStructuredChat:

0 commit comments

Comments (0)