|
# Default model identifiers used by the provider-specific adapter factories
# and tests below. Kept cheap/small so integration runs stay fast.
OPENAI_MODEL = "gpt-4o-mini"
ANTHROPIC_MODEL = "claude-haiku-4-5"
GOOGLE_MODEL = "gemini-2.0-flash"
# Separate preview model used only by the Gemini thinking-mode tests
# (distinct from GOOGLE_MODEL; presumably thinking-capable — confirm).
GOOGLE_THINKING_MODEL = "gemini-3-flash-preview"
LITELLM_MODEL = "gpt-4o-mini"
|
46 | 47 |
|
@@ -372,3 +373,128 @@ def test_structured_output(self): |
372 | 373 | assert response.structured_response.city.lower() == "paris" |
373 | 374 | assert response.structured_response.country.lower() == "france" |
374 | 375 | assert response.content is not None |
| 376 | + |
| 377 | + |
| 378 | +# ============================================================================= |
| 379 | +# Cross-provider parameterized tests |
| 380 | +# ============================================================================= |
| 381 | + |
| 382 | + |
def _make_openai_adapter(**kwargs):
    """Return an OpenAI adapter bound to the default test model."""
    from openai import OpenAI
    from maseval.interface.inference.openai import OpenAIModelAdapter

    client = OpenAI()
    return OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL, **kwargs)
| 388 | + |
| 389 | + |
def _make_anthropic_adapter(**kwargs):
    """Return an Anthropic adapter bound to the default test model.

    Anthropic requires an explicit max_tokens, so a small default is set here.
    """
    from anthropic import Anthropic
    from maseval.interface.inference.anthropic import AnthropicModelAdapter

    client = Anthropic()
    return AnthropicModelAdapter(
        client=client, model_id=ANTHROPIC_MODEL, max_tokens=100, **kwargs
    )
| 395 | + |
| 396 | + |
def _make_google_adapter(**kwargs):
    """Return a Google GenAI adapter bound to the default test model."""
    from google import genai
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

    client = genai.Client()
    return GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL, **kwargs)
| 402 | + |
| 403 | + |
def _make_litellm_adapter(**kwargs):
    """Return a LiteLLM adapter; skips the calling test if litellm is missing."""
    # importorskip must run before the maseval litellm module is imported,
    # since that module depends on the litellm package.
    pytest.importorskip("litellm")
    from maseval.interface.inference.litellm import LiteLLMModelAdapter

    return LiteLLMModelAdapter(model_id=LITELLM_MODEL, **kwargs)
| 409 | + |
| 410 | + |
# Each entry: (factory, env_var, max_tokens_param_name, supports_seed)
_ADAPTER_CONFIGS = [
    pytest.param(factory, env_var, tok_key, seeded, id=label)
    for factory, env_var, tok_key, seeded, label in (
        (_make_openai_adapter, "OPENAI_API_KEY", "max_tokens", True, "openai"),
        (_make_anthropic_adapter, "ANTHROPIC_API_KEY", "max_tokens", False, "anthropic"),
        (_make_google_adapter, "GOOGLE_API_KEY", "max_output_tokens", True, "google"),
        (_make_litellm_adapter, "OPENAI_API_KEY", "max_tokens", True, "litellm"),
    )
]
| 418 | + |
| 419 | + |
class TestCrossProviderStructuredOutput:
    """Parameterized structured output tests across all adapters."""

    @staticmethod
    def _require_env(env_var):
        # Skip the current test when the provider's API key is not configured.
        if not os.environ.get(env_var):
            pytest.skip(f"{env_var} not set")

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_structured_output_with_generation_params(self, factory, env_var, max_tok_key, supports_seed):
        """Structured output works with temperature and seed across all providers."""
        self._require_env(env_var)
        adapter = factory(seed=42) if supports_seed else factory()
        params = {"temperature": 0.0, max_tok_key: 100}
        result = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params=params,
        )
        capital = result.structured_response
        assert isinstance(capital, Capital)
        assert capital.city.lower() == "paris"
        assert capital.country.lower() == "france"

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_tool_call_then_structured_output(self, factory, env_var, max_tok_key, supports_seed):
        """Tool calling and structured output both work on the same adapter instance."""
        self._require_env(env_var)
        adapter = factory()

        # First exercise tool calling.
        tool_result = adapter.chat(
            [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}],
            tools=[WEATHER_TOOL],
            generation_params={max_tok_key: 100},
        )
        calls = tool_result.tool_calls
        assert calls is not None
        assert len(calls) >= 1
        first_fn = calls[0]["function"]
        assert first_fn["name"] == "get_weather"

        parsed_args = json.loads(first_fn["arguments"])
        assert isinstance(parsed_args, dict)
        assert "city" in parsed_args

        # Then structured output on the very same adapter instance.
        structured = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params={max_tok_key: 100},
        )
        assert isinstance(structured.structured_response, Capital)
        assert structured.structured_response.city.lower() == "paris"
| 467 | + |
| 468 | + |
class TestGoogleGenAIThinking:
    """Google GenAI structured output with thinking mode enabled.

    Validates the workaround for instructor GENAI_TOOLS bugs with thinking mode:
    (1) content is None on MALFORMED_FUNCTION_CALL causing AttributeError, and
    (2) duplicate function call parts failing an assertion.
    Using GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
    """

    @requires_google
    def test_structured_output_with_thinking(self):
        """Structured output works when Gemini thinking mode is enabled."""
        from google import genai
        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

        # Enable thinking via default generation params on the adapter itself.
        thinking_defaults = {"thinking_config": {"thinking_budget": 1024}}
        adapter = GoogleGenAIModelAdapter(
            client=genai.Client(),
            model_id=GOOGLE_THINKING_MODEL,
            default_generation_params=thinking_defaults,
        )

        result = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
        )

        parsed = result.structured_response
        assert isinstance(parsed, Capital)
        assert parsed.city.lower() == "paris"
        assert parsed.country.lower() == "france"
        assert result.content is not None