Another attempt at fixing the Google GenAI instructor bug: use GENAI_STRUCTURED_OUTPUTS and strip additionalProperties

cemde · cemde · commit 13aabe7c74df · 2026-03-24T00:17:47.000+01:00
diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py
@@ -319,6 +319,64 @@ def _parse_response(self, response: Any) -> ChatResponse:
             stop_reason=stop_reason,
         )
 
+    @staticmethod
+    def _clean_schema_model(model: type) -> type:
+        """Create a subclass whose JSON schema omits additionalProperties.
+
+        Gemini's structured output API rejects the additionalProperties keyword,
+        which Pydantic v2 emits for some fields (e.g. Dict[str, Any]). The thin
+        subclass returned here overrides model_json_schema() to strip it.
+
+        Args:
+            model: The Pydantic model class to wrap.
+
+        Returns:
+            A subclass of ``model`` that keeps the original __name__ and
+            __qualname__ (for instructor's function naming). Being a subclass,
+            its instances still pass isinstance() checks against ``model``.
+        """
+        # NOTE(review): a new subclass is built on every call; consider caching
+        # one per model if class creation becomes a measurable cost.
+
+        class CleanSchemaModel(model):  # ty: ignore[unsupported-base]
+            @classmethod
+            def model_json_schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+                schema = super().model_json_schema(*args, **kwargs)  # ty: ignore[unresolved-attribute]
+                GoogleGenAIModelAdapter._strip_additional_properties(schema)
+                return schema
+
+        CleanSchemaModel.__name__ = model.__name__
+        CleanSchemaModel.__qualname__ = model.__qualname__
+        return CleanSchemaModel
+
+    @staticmethod
+    def _strip_additional_properties(obj: Any) -> None:
+        """Recursively remove the additionalProperties keyword from a JSON schema.
+
+        The schema is modified in place. Keys directly inside a "properties",
+        "$defs" or "definitions" mapping are field/definition names rather than
+        schema keywords, so an entry that happens to be named
+        "additionalProperties" there is kept; only its sub-schema is cleaned.
+
+        Args:
+            obj: A JSON schema fragment. Dicts and lists are traversed; any
+                other value is ignored.
+        """
+        strip = GoogleGenAIModelAdapter._strip_additional_properties
+        if isinstance(obj, dict):
+            obj.pop("additionalProperties", None)
+            for key, value in obj.items():
+                if key in ("properties", "$defs", "definitions") and isinstance(value, dict):
+                    # Name map: skip the keyword check on its keys and descend
+                    # straight into each named sub-schema.
+                    for sub_schema in value.values():
+                        strip(sub_schema)
+                else:
+                    strip(value)
+        elif isinstance(obj, list):
+            for item in obj:
+                strip(item)
+
     def _structured_chat(
         self,
         messages: List[Dict[str, Any]],
@@ -331,9 +363,16 @@ def _structured_chat(
     ) -> "ChatResponse":
         """Use instructor for structured output with validation and retries."""
         if self._instructor_client is None:
+            import instructor
             from instructor import from_genai
 
-            self._instructor_client = from_genai(self._client)
+            # Use GENAI_STRUCTURED_OUTPUTS (native JSON schema output) instead of
+            # GENAI_TOOLS (function calling). GENAI_TOOLS has upstream instructor bugs
+            # with Gemini thinking mode: (1) AttributeError when content is None on
+            # MALFORMED_FUNCTION_CALL, and (2) AssertionError on duplicate function call
+            # parts. These are not retried by instructor (only ValidationError is).
+            # GENAI_STRUCTURED_OUTPUTS parses via completion.text, avoiding both issues.
+            self._instructor_client = from_genai(self._client, mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS)
 
         params = dict(self._default_generation_params)
         if generation_params:
@@ -358,9 +397,15 @@ def _structured_chat(
         if gen_config:
             instructor_kwargs["generation_config"] = gen_config
 
+        # Wrap the response model to strip additionalProperties from its JSON schema.
+        # Pydantic v2 emits additionalProperties for some fields (e.g. Dict[str, Any]),
+        # but Gemini's structured output API rejects it. GENAI_TOOLS handles this via
+        # map_to_genai_schema(), but GENAI_STRUCTURED_OUTPUTS passes the raw Pydantic class to the SDK.
+        clean_model = self._clean_schema_model(response_model)
+
         result = self._instructor_client.chat.completions.create(
             model=self._model_id,
-            response_model=response_model,  # ty: ignore[invalid-argument-type]
+            response_model=clean_model,  # ty: ignore[invalid-argument-type]
             messages=messages,  # ty: ignore[invalid-argument-type]
             max_retries=max_retries,
             **instructor_kwargs,
diff --git a/tests/test_interface/test_model_integration/test_live_api.py b/tests/test_interface/test_model_integration/test_live_api.py
@@ -27,6 +27,7 @@
 
 import json
 import os
+from typing import Any, Dict
 
 import pytest
 from pydantic import BaseModel, Field
@@ -41,6 +42,7 @@
 OPENAI_MODEL = "gpt-4o-mini"
 ANTHROPIC_MODEL = "claude-haiku-4-5"
 GOOGLE_MODEL = "gemini-2.0-flash"
+GOOGLE_THINKING_MODEL = "gemini-2.5-flash"
 LITELLM_MODEL = "gpt-4o-mini"
 
 
@@ -316,9 +318,8 @@ def test_structured_output(self):
     def test_structured_output_nested_model(self):
         """Nested Pydantic models work despite additionalProperties in JSON schema.
 
-        Pydantic v2 emits additionalProperties: false for nested models.
-        Gemini's native JSON schema output (GENAI_STRUCTURED_OUTPUTS) rejects this,
-        but GENAI_TOOLS (function calling) handles it correctly via schema conversion.
+        Pydantic v2 emits additionalProperties for nested models and Dict fields.
+        The adapter strips it via _clean_schema_model before the SDK sees it.
         """
         from google import genai
         from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
@@ -335,6 +336,68 @@ def test_structured_output_nested_model(self):
         assert response.structured_response.location.city.lower() == "paris"
         assert response.content is not None
 
+    @requires_google
+    def test_structured_output_with_thinking(self):
+        """Gemini returns a validated structured response with thinking mode on.
+
+        With GENAI_TOOLS, thinking triggers upstream instructor bugs (AttributeError
+        on MALFORMED_FUNCTION_CALL, AssertionError on duplicate parts).
+        GENAI_STRUCTURED_OUTPUTS parses completion.text directly and avoids them.
+        """
+        from google import genai
+        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
+
+        thinking_params = {
+            "thinking_config": {"thinking_budget": 1024},
+        }
+        question = [{"role": "user", "content": "What is the capital of France?"}]
+
+        thinking_adapter = GoogleGenAIModelAdapter(
+            client=genai.Client(),
+            model_id=GOOGLE_THINKING_MODEL,
+            default_generation_params=thinking_params,
+        )
+        reply = thinking_adapter.chat(question, response_model=Capital)
+        capital = reply.structured_response
+
+        assert isinstance(capital, Capital)
+        assert capital.city.lower() == "paris"
+        assert capital.country.lower() == "france"
+        assert reply.content is not None
+
+    @requires_google
+    def test_structured_output_dict_field_with_thinking(self):
+        """Dict[str, Any] fields and thinking mode work together.
+
+        This is the exact scenario from the ToolLLMSimulator bug: a response model
+        with Dict[str, Any] (additionalProperties) used with a thinking-enabled model.
+        """
+        from google import genai
+        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
+
+        class ToolOutput(BaseModel):
+            text: str = Field(default="", description="Description")
+            details: Dict[str, Any] = Field(default_factory=dict, description="Structured data")
+
+        client = genai.Client()
+        adapter = GoogleGenAIModelAdapter(
+            client=client,
+            model_id=GOOGLE_THINKING_MODEL,
+            default_generation_params={
+                "thinking_config": {"thinking_budget": 1024},
+            },
+        )
+        response = adapter.chat(
+            [{"role": "user", "content": "Simulate a weather API response for Paris."}],
+            response_model=ToolOutput,
+        )
+
+        # The adapter hands instructor a cleaned subclass, so the parsed result
+        # must still be an instance of the original model.
+        assert isinstance(response.structured_response, ToolOutput)
+        assert response.structured_response.text
+        assert response.content is not None
+
 
 # =============================================================================
 # LiteLLM (routes through OpenAI)
diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py
@@ -2429,15 +2429,16 @@ def __init__(self):
         # thinking_config must NOT be in generation_config
         assert "thinking_config" not in gen_config
 
-    def test_structured_chat_uses_default_genai_tools_mode(self):
-        """Instructor client uses default GENAI_TOOLS mode, not GENAI_STRUCTURED_OUTPUTS.
+    def test_structured_chat_uses_genai_structured_outputs_mode(self):
+        """Instructor client uses GENAI_STRUCTURED_OUTPUTS mode.
 
-        GENAI_STRUCTURED_OUTPUTS passes Pydantic schemas directly to Gemini's native
-        JSON schema output, which rejects additionalProperties (emitted by Pydantic v2).
-        GENAI_TOOLS uses map_to_genai_schema() which strips unsupported fields.
+        GENAI_TOOLS has upstream bugs with thinking mode (AttributeError on
+        MALFORMED_FUNCTION_CALL, AssertionError on duplicate parts).
+        GENAI_STRUCTURED_OUTPUTS parses via completion.text, avoiding both.
         """
         pytest.importorskip("google.genai")
         pytest.importorskip("instructor")
+        import instructor
         from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
 
         class MockClient:
@@ -2465,20 +2466,62 @@ def __init__(self):
             )
 
             mock_from_genai.assert_called_once()
-            # Must NOT pass mode=GENAI_STRUCTURED_OUTPUTS — only GENAI_TOOLS
-            # handles additionalProperties in Pydantic schemas correctly.
-            _, call_kwargs = mock_from_genai.call_args
-            assert "mode" not in call_kwargs
+            call_kwargs = mock_from_genai.call_args
+            assert call_kwargs.kwargs.get("mode") == instructor.Mode.GENAI_STRUCTURED_OUTPUTS
 
-    def test_structured_chat_with_dict_field_model(self):
-        """Response models with Dict[str, Any] fields (like ToolSimulatorResponse) work.
+    def test_clean_schema_model_strips_additional_properties(self):
+        """_clean_schema_model strips additionalProperties from JSON schema.
 
-        Dict fields cause Pydantic v2 to emit additionalProperties in the JSON schema.
-        This must not be passed raw to the Gemini API.
+        Pydantic v2 emits additionalProperties for Dict fields and nested models.
+        Gemini's structured output API rejects it. The clean model must strip it
+        while preserving parsing and isinstance() behavior.
         """
         pytest.importorskip("google.genai")
         from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
 
+        from typing import Any, Dict
+
+        from pydantic import BaseModel, Field
+
+        class Inner(BaseModel):
+            name: str
+
+        class Outer(BaseModel):
+            text: str = Field(default="")
+            details: Dict[str, Any] = Field(default_factory=dict)
+            inner: Inner = Field(default_factory=lambda: Inner(name=""))
+
+        # Precondition: the unmodified schema really contains additionalProperties
+        # (from the Dict field); otherwise the "not in" assertion on the cleaned
+        # schema below would pass vacuously.
+        original_schema = Outer.model_json_schema()
+        assert "additionalProperties" in str(original_schema)
+
+        # Cleaned model's schema does not
+        CleanOuter = GoogleGenAIModelAdapter._clean_schema_model(Outer)
+        clean_schema = CleanOuter.model_json_schema()  # ty: ignore[unresolved-attribute]
+        assert "additionalProperties" not in str(clean_schema)
+
+        # Parsing still works and returns instances of the original class
+        instance = CleanOuter.model_validate({"text": "hi", "details": {"k": "v"}, "inner": {"name": "test"}})  # ty: ignore[unresolved-attribute]
+        assert isinstance(instance, Outer)
+        assert instance.text == "hi"
+        assert instance.details == {"k": "v"}
+        assert instance.inner.name == "test"
+
+    def test_structured_chat_passes_clean_model_to_instructor(self):
+        """_structured_chat wraps the response model to strip additionalProperties."""
+        pytest.importorskip("google.genai")
+        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
+
+        from typing import Any, Dict
+
+        from pydantic import BaseModel, Field
+
+        class ToolOutput(BaseModel):
+            text: str = Field(default="", description="Description")
+            details: Dict[str, Any] = Field(default_factory=dict, description="Structured data")
+
         class MockClient:
             class Models:
                 def generate_content(self, model, contents, config=None):
@@ -2497,27 +2540,21 @@ def __init__(self):
         mock_instructor.chat.completions.create.return_value = mock_result
         adapter._instructor_client = mock_instructor
 
-        # Use a model with Dict[str, Any] — same pattern as ToolSimulatorResponse
-        from typing import Any, Dict
-
-        from pydantic import BaseModel, Field
-
-        class ToolOutput(BaseModel):
-            text: str = Field(default="", description="Description")
-            details: Dict[str, Any] = Field(default_factory=dict, description="Structured data")
-
-        response = adapter._structured_chat(
+        adapter._structured_chat(
             messages=[{"role": "user", "content": "Hi"}],
             response_model=ToolOutput,
             generation_params={"temperature": 0.5},
         )
 
-        # Verify the call went through with correct kwargs structure
         call_kwargs = mock_instructor.chat.completions.create.call_args.kwargs
-        assert call_kwargs["response_model"] is ToolOutput
+        passed_model = call_kwargs["response_model"]
+        # Must be a subclass (not the original) with clean schema
+        assert passed_model is not ToolOutput
+        assert issubclass(passed_model, ToolOutput)
+        assert "additionalProperties" not in str(passed_model.model_json_schema())
+        # Generation params still wrapped correctly
         gen_config = call_kwargs.get("generation_config", {})
         assert gen_config.get("temperature") == 0.5
-        assert response.structured_response is mock_result
 
 
 @pytest.mark.interface