From 6db7d8eeec0bfdd23199802868029ca8b45188ef Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Wed, 27 May 2026 22:09:20 +0200
Subject: [PATCH 1/3] ai: support keyless Gemini fallback

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../configure_judge_llms.ipynb                |  14 +--
 tests/unit_tests/test_ai_utils.py             | 105 ++++++++++++++++++
 validmind/ai/utils.py                         |  91 +++++++++++----
 validmind/datasets/llm/agent_dataset.py       |  14 ++-
 validmind/scorers/llm/deepeval/__init__.py    |   2 +-
 .../tests/data_validation/MissingValues.py    |   6 +-
 .../ragas/AnswerCorrectness.py                |   2 +-
 .../model_validation/ragas/AspectCritic.py    |   2 +-
 .../ragas/ContextEntityRecall.py              |   2 +-
 .../ragas/ContextPrecision.py                 |   2 +-
 .../ragas/ContextPrecisionWithoutReference.py |   2 +-
 .../model_validation/ragas/ContextRecall.py   |   2 +-
 .../model_validation/ragas/Faithfulness.py    |   2 +-
 .../ragas/SemanticSimilarity.py               |   2 +-
 .../sklearn/FeatureImportance.py              |   6 +-
 .../sklearn/HyperParametersTuning.py          |   4 +-
 .../sklearn/RobustnessDiagnosis.py            |   6 +-
 .../prompt_validation/ai_powered_test.py      |   3 +-
 18 files changed, 212 insertions(+), 55 deletions(-)

diff --git a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
index 3e8d27bcd..475b44c20 100644
--- a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
+++ b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
@@ -14,7 +14,7 @@
         "2. RAGAS-based tests, which depend on both the default judge LLM and the default judge embeddings model.\n",
         "3. DeepEval scorers, which depend on the default local scorer model path.\n",
         "\n",
-        "The notebook automatically selects the available provider from your environment, with OpenAI taking precedence when both OpenAI and Gemini keys are set, to match the library's default-provider logic."
+        "The notebook automatically selects the provider using the same logic as the library itself: OpenAI takes precedence when explicitly configured, Azure OpenAI is selected next when available, and Gemini is the default fallback. Gemini can be used with `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or through a keyless setup as long as Gemini can initialize successfully in your environment."
       ]
     },
     {
@@ -104,11 +104,11 @@
         "Before running this notebook, make sure you have:\n",
         "- a Python environment with the ValidMind Library and its LLM dependencies installed\n",
         "- access to a ValidMind account if you want to log results to the ValidMind Platform\n",
-        "- credentials for one supported judge provider in your environment\n",
+        "- access to one supported judge provider in your environment\n",
         "\n",
         "This notebook supports:\n",
         "- OpenAI via `OPENAI_API_KEY`, with optional `OPENAI_MODEL` and `OPENAI_EMBEDDINGS_MODEL` overrides. The current default judge model is `gpt-4.1` and the default embeddings model is `text-embedding-3-small`.\n",
-        "- Gemini via `GOOGLE_API_KEY` or `GEMINI_API_KEY`, with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
+        "- Gemini with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. If `GOOGLE_API_KEY` or `GEMINI_API_KEY` is set, it will be used. Keyless Gemini setups are also supported as long as Gemini can initialize successfully in your environment. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
         "- Azure OpenAI via `AZURE_OPENAI_KEY`, `AZURE_OPENAI_ENDPOINT`, and `AZURE_OPENAI_MODEL`. The current default embeddings model is `text-embedding-3-small`.\n",
         "\n",
         "You can still run the notebook locally without connecting to the ValidMind Platform, but connecting a model document makes it easier to review and share results after the tests complete."
@@ -361,14 +361,14 @@
         "\n",
         "## Configure the judge provider\n",
         "\n",
-        "The next cells load your environment variables, resolve the judge provider from the credentials available in your session, and initialize the ValidMind Library for result logging.\n",
+        "The next cells load your environment variables, resolve the judge provider from the configuration available in your session, and initialize the ValidMind Library for result logging.\n",
         "\n",
         "This notebook uses the same provider resolution logic as the library itself:\n",
         "- OpenAI is selected when `OPENAI_API_KEY` is available, with `OPENAI_MODEL` as an optional override. The current default judge model is `gpt-4.1`.\n",
         "- Azure OpenAI is selected when Azure credentials are available, using `AZURE_OPENAI_MODEL` for the judge model.\n",
-        "- Gemini is selected when `GOOGLE_API_KEY` or `GEMINI_API_KEY` is available, with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
+        "- Gemini is the default fallback when OpenAI and Azure are not explicitly configured. Optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides are supported. If `GOOGLE_API_KEY` or `GEMINI_API_KEY` is available, it will be used; otherwise, keyless Gemini setups are supported as long as Gemini can initialize successfully in your environment. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
         "\n",
-        "If more than one provider is configured, OpenAI takes precedence to match the library default.\n",
+        "If more than one provider is configured, OpenAI takes precedence over Azure OpenAI and Gemini, and Azure OpenAI takes precedence over Gemini, to match the library default.\n",
         "\n",
         "This matters because the same default judge configuration is reused across multiple evaluation paths, so checking it once here makes the later test results easier to interpret."
       ]
@@ -626,7 +626,7 @@
         "\n",
         "As in the RAGAS example, we create a ValidMind dataset with `vm.init_dataset()` so the scorer workflow runs against the same kind of object customers would use in their own notebooks.\n",
         "\n",
-        "These scorers do not use the judge embeddings object. For this notebook, we use two representative examples:\n",
+        "These scorers do not use the judge embeddings object. They now support both Gemini API-key setups and keyless Gemini setups through the same default model-resolution path. For this notebook, we use two representative examples:\n",
         "- `AnswerRelevancy`\n",
         "- `Hallucination`\n",
         "\n",
diff --git a/tests/unit_tests/test_ai_utils.py b/tests/unit_tests/test_ai_utils.py
index bdc857bd5..8a2c9f366 100644
--- a/tests/unit_tests/test_ai_utils.py
+++ b/tests/unit_tests/test_ai_utils.py
@@ -24,6 +24,16 @@ def test_get_client_and_model_supports_gemini_env():
     assert model == ai_utils.GEMINI_MODEL
 
 
+def test_get_client_and_model_defaults_to_gemini_without_provider_env():
+    _reset_ai_utils_state()
+
+    with mock.patch.dict(os.environ, {}, clear=True):  # no provider env vars set
+        client, model = ai_utils.get_client_and_model()
+
+    assert client is None  # the Gemini branch creates no client object
+    assert model == ai_utils.GEMINI_MODEL
+
+
 def test_get_judge_config_builds_gemini_models():
     _reset_ai_utils_state()
 
@@ -59,6 +69,35 @@ def __init__(self, **kwargs):
     }
 
 
+def test_get_judge_config_builds_gemini_models_without_provider_env():
+    _reset_ai_utils_state()
+
+    class FakeChatGoogleGenerativeAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeGoogleGenerativeAIEmbeddings:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    fake_module = types.SimpleNamespace(  # stands in for langchain_google_genai
+        ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+        GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+    )
+
+    with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+        sys.modules, {"langchain_google_genai": fake_module}
+    ):
+        judge_llm, judge_embeddings = ai_utils.get_judge_config()
+
+    assert isinstance(judge_llm, FakeChatGoogleGenerativeAI)
+    assert judge_llm.kwargs == {"model": ai_utils.GEMINI_MODEL}  # no api_key passed
+    assert isinstance(judge_embeddings, FakeGoogleGenerativeAIEmbeddings)
+    assert judge_embeddings.kwargs == {
+        "model": ai_utils.GEMINI_EMBEDDINGS_MODEL,
+    }
+
+
 def test_is_configured_uses_resolved_judge_model():
     _reset_ai_utils_state()
 
@@ -97,3 +136,69 @@ def __init__(self, **kwargs):
         "api_key": "test-key",
         "temperature": 0,
     }
+
+
+def test_get_deepeval_model_supports_keyless_gemini_without_provider_env():
+    _reset_ai_utils_state()
+
+    class FakeDeepEvalBaseLLM:  # stand-in for deepeval's DeepEvalBaseLLM
+        pass
+
+    class FakeChatGoogleGenerativeAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+        def invoke(self, prompt):
+            return types.SimpleNamespace(content=f"sync:{prompt}")
+
+        async def ainvoke(self, prompt):
+            return types.SimpleNamespace(content=f"async:{prompt}")
+
+        def with_structured_output(self, schema):
+            class StructuredModel:
+                def invoke(self, prompt):
+                    return {"schema": schema, "prompt": prompt}
+
+                async def ainvoke(self, prompt):
+                    return {"schema": schema, "prompt": prompt}
+
+            return StructuredModel()
+
+    class FakeGoogleGenerativeAIEmbeddings:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    fake_google_module = types.SimpleNamespace(
+        ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+        GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+    )
+    fake_deepeval_module = types.ModuleType("deepeval")
+    fake_deepeval_models_module = types.ModuleType("deepeval.models")
+    fake_deepeval_base_model = types.SimpleNamespace(DeepEvalBaseLLM=FakeDeepEvalBaseLLM)
+
+    with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+        sys.modules,
+        {
+            "deepeval": fake_deepeval_module,
+            "deepeval.models": fake_deepeval_models_module,
+            "langchain_google_genai": fake_google_module,
+            "deepeval.models.base_model": fake_deepeval_base_model,
+        },
+    ):
+        model = ai_utils.get_deepeval_model()  # keyless: env has no Gemini API key
+
+    assert isinstance(model, FakeDeepEvalBaseLLM)
+    assert model.get_model_name() == ai_utils.GEMINI_MODEL
+    assert model.generate("hello") == "sync:hello"  # .content is unwrapped
+    assert model.generate("hello", schema="MySchema") == {
+        "schema": "MySchema",
+        "prompt": "hello",
+    }
+
+    import asyncio
+
+    assert asyncio.run(model.a_generate("hello")) == "async:hello"
+    assert asyncio.run(model.a_generate("hello", schema="MySchema")) == {
+        "schema": "MySchema",
+        "prompt": "hello",
+    }
diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py
index e717ebe59..eff5d9d59 100644
--- a/validmind/ai/utils.py
+++ b/validmind/ai/utils.py
@@ -59,10 +59,7 @@ def _get_configured_provider():
     if os.getenv("AZURE_OPENAI_KEY"):
         return "azure"
 
-    if _get_google_api_key():
-        return "gemini"
-
-    return None
+    return "gemini"
 
 
 def get_client_and_model():
@@ -104,18 +101,12 @@ def get_client_and_model():
 
         logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")
 
-    elif provider == "gemini":
+    else:
         __client = None
         __model = os.getenv("GEMINI_MODEL", GEMINI_MODEL)
 
         logger.debug(f"Using Gemini {__model} for generating descriptions")
 
-    else:
-        raise ValueError(
-            "OPENAI_API_KEY, AZURE_OPENAI_KEY, GOOGLE_API_KEY, or GEMINI_API_KEY "
-            "must be setup to use LLM features"
-        )
-
     return __client, __model
 
 
@@ -202,16 +193,18 @@ def _build_gemini_judge_config(model):
         langchain_google_genai, "GoogleGenerativeAIEmbeddings"
     )
     google_api_key = _get_google_api_key()
+    chat_kwargs = {"model": model}
+    embeddings_kwargs = {
+        "model": os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
+    }
+
+    if google_api_key:
+        chat_kwargs["api_key"] = google_api_key
+        embeddings_kwargs["google_api_key"] = google_api_key
 
     return (
-        ChatGoogleGenerativeAI(
-            model=model,
-            api_key=google_api_key,
-        ),
-        GoogleGenerativeAIEmbeddings(
-            model=os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
-            google_api_key=google_api_key,
-        ),
+        ChatGoogleGenerativeAI(**chat_kwargs),
+        GoogleGenerativeAIEmbeddings(**embeddings_kwargs),
     )
 
 
@@ -230,6 +223,60 @@ def _build_openai_judge_config(client, model):
     )
 
 
+def _import_deepeval_base_llm():  # lazy import: deepeval is an optional extra
+    try:
+        deepeval_base_model = importlib.import_module("deepeval.models.base_model")
+    except ImportError as err:
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
+        ) from err  # keep the original import failure as __cause__
+
+    return getattr(deepeval_base_model, "DeepEvalBaseLLM")
+
+
+def _unwrap_deepeval_response(response):
+    return getattr(response, "content", response)  # pass through if no .content
+
+
+def _build_gemini_deepeval_model(model):
+    DeepEvalBaseLLM = _import_deepeval_base_llm()
+    judge_llm, _ = _build_gemini_judge_config(model)  # embeddings model unused here
+
+    class GeminiDeepEvalModel(DeepEvalBaseLLM):  # adapts the LangChain chat model
+        def __init__(self, chat_model, model_name):
+            self._chat_model = chat_model
+            self._model_name = model_name
+            self.model = self.load_model()
+
+        def load_model(self, *args, **kwargs):
+            return self._chat_model  # already constructed; nothing to load
+
+        def generate(self, prompt: str, schema=None):
+            chat_model = self.load_model()
+            if schema is not None and hasattr(chat_model, "with_structured_output"):
+                response = chat_model.with_structured_output(schema).invoke(prompt)
+            else:  # no schema, or the model lacks structured output
+                response = chat_model.invoke(prompt)
+
+            return _unwrap_deepeval_response(response)
+
+        async def a_generate(self, prompt: str, schema=None):  # async twin of generate
+            chat_model = self.load_model()
+            if schema is not None and hasattr(chat_model, "with_structured_output"):
+                response = await chat_model.with_structured_output(schema).ainvoke(
+                    prompt
+                )
+            else:
+                response = await chat_model.ainvoke(prompt)
+
+            return _unwrap_deepeval_response(response)
+
+        def get_model_name(self, *args, **kwargs):
+            return self._model_name
+
+    return GeminiDeepEvalModel(judge_llm, model)
+
+
 def get_judge_config(judge_llm=None, judge_embeddings=None):
     Embeddings, BaseChatModel, FunctionModel = _import_judge_dependencies()
 
@@ -270,6 +317,10 @@ def get_deepeval_model():
     _, model = get_client_and_model()
 
     if provider == "gemini":
+        google_api_key = _get_google_api_key()
+        if google_api_key is None:
+            return _build_gemini_deepeval_model(model)
+
         try:
             deepeval_models = importlib.import_module("deepeval.models")
         except ImportError:
@@ -280,7 +331,7 @@ def get_deepeval_model():
         GeminiModel = getattr(deepeval_models, "GeminiModel")
         return GeminiModel(
             model=model,
-            api_key=_get_google_api_key(),
+            api_key=google_api_key,
             temperature=0,
         )
 
diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py
index a3f05ac47..c35c753cd 100644
--- a/validmind/datasets/llm/agent_dataset.py
+++ b/validmind/datasets/llm/agent_dataset.py
@@ -413,14 +413,16 @@ def to_deepeval_test_cases(self) -> List[Any]:
 
                     test_case = LLMTestCase(
                         input=str(row["input"]),
-                        actual_output=str(row["actual_output"])
-                        if pd.notna(row["actual_output"])
-                        else "",
+                        actual_output=(
+                            str(row["actual_output"])
+                            if pd.notna(row["actual_output"])
+                            else ""
+                        ),
                         expected_output=expected_output_val,
                         context=context_val if context_val else None,
-                        retrieval_context=retrieval_context_val
-                        if retrieval_context_val
-                        else None,
+                        retrieval_context=(
+                            retrieval_context_val if retrieval_context_val else None
+                        ),
                         # Note: tools_called deserialization would need more complex logic
                         # for now we'll keep it simple
                     )
diff --git a/validmind/scorers/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py
index 4a1de3536..3cf84a2d7 100644
--- a/validmind/scorers/llm/deepeval/__init__.py
+++ b/validmind/scorers/llm/deepeval/__init__.py
@@ -104,7 +104,7 @@ def _extract_tool_calls_from_message(
 
 
 def extract_tool_calls_from_agent_output(
-    agent_output: Dict[str, Any]
+    agent_output: Dict[str, Any],
 ) -> List[ToolCall]:
     """Extract ToolCall objects from an agent's output.
 
diff --git a/validmind/tests/data_validation/MissingValues.py b/validmind/tests/data_validation/MissingValues.py
index 63b924f88..78532f106 100644
--- a/validmind/tests/data_validation/MissingValues.py
+++ b/validmind/tests/data_validation/MissingValues.py
@@ -62,9 +62,9 @@ def MissingValues(
                 "Column": col,
                 "Number of Missing Values": missing[col],
                 "Percentage of Missing Values (%)": missing_pct[col],
-                "Pass/Fail": "Pass"
-                if missing_pct[col] <= min_percentage_threshold
-                else "Fail",
+                "Pass/Fail": (
+                    "Pass" if missing_pct[col] <= min_percentage_threshold else "Fail"
+                ),
             }
             for col in missing.index
         ],
diff --git a/validmind/tests/model_validation/ragas/AnswerCorrectness.py b/validmind/tests/model_validation/ragas/AnswerCorrectness.py
index cfd145920..2081bce50 100644
--- a/validmind/tests/model_validation/ragas/AnswerCorrectness.py
+++ b/validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -125,7 +125,7 @@ def AnswerCorrectness(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[answer_correctness()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "answer_correctness"
diff --git a/validmind/tests/model_validation/ragas/AspectCritic.py b/validmind/tests/model_validation/ragas/AspectCritic.py
index 176da2eec..f52446101 100644
--- a/validmind/tests/model_validation/ragas/AspectCritic.py
+++ b/validmind/tests/model_validation/ragas/AspectCritic.py
@@ -165,7 +165,7 @@ def AspectCritic(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=all_aspects,
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     # reverse the score for aspects where lower is better
diff --git a/validmind/tests/model_validation/ragas/ContextEntityRecall.py b/validmind/tests/model_validation/ragas/ContextEntityRecall.py
index 756954c48..a95bed63f 100644
--- a/validmind/tests/model_validation/ragas/ContextEntityRecall.py
+++ b/validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -120,7 +120,7 @@ def ContextEntityRecall(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[context_entity_recall()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "context_entity_recall"
diff --git a/validmind/tests/model_validation/ragas/ContextPrecision.py b/validmind/tests/model_validation/ragas/ContextPrecision.py
index 06a355182..e6890916b 100644
--- a/validmind/tests/model_validation/ragas/ContextPrecision.py
+++ b/validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -116,7 +116,7 @@ def ContextPrecision(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[context_precision()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "llm_context_precision_with_reference"
diff --git a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
index 04154d1f2..3b9d91715 100644
--- a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
+++ b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
@@ -111,7 +111,7 @@ def ContextPrecisionWithoutReference(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[context_precision()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "llm_context_precision_without_reference"
diff --git a/validmind/tests/model_validation/ragas/ContextRecall.py b/validmind/tests/model_validation/ragas/ContextRecall.py
index 17272e372..511fd7b89 100644
--- a/validmind/tests/model_validation/ragas/ContextRecall.py
+++ b/validmind/tests/model_validation/ragas/ContextRecall.py
@@ -117,7 +117,7 @@ def ContextRecall(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[context_recall()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "context_recall"
diff --git a/validmind/tests/model_validation/ragas/Faithfulness.py b/validmind/tests/model_validation/ragas/Faithfulness.py
index 0cdc8d1c1..6c25068b9 100644
--- a/validmind/tests/model_validation/ragas/Faithfulness.py
+++ b/validmind/tests/model_validation/ragas/Faithfulness.py
@@ -122,7 +122,7 @@ def Faithfulness(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[faithfulness()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "faithfulness"
diff --git a/validmind/tests/model_validation/ragas/SemanticSimilarity.py b/validmind/tests/model_validation/ragas/SemanticSimilarity.py
index aa5c953ef..309331228 100644
--- a/validmind/tests/model_validation/ragas/SemanticSimilarity.py
+++ b/validmind/tests/model_validation/ragas/SemanticSimilarity.py
@@ -114,7 +114,7 @@ def SemanticSimilarity(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[semantic_similarity()],
-        **get_ragas_config(judge_llm, judge_embeddings)
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "semantic_similarity"
diff --git a/validmind/tests/model_validation/sklearn/FeatureImportance.py b/validmind/tests/model_validation/sklearn/FeatureImportance.py
index 91cb32618..8f4769161 100644
--- a/validmind/tests/model_validation/sklearn/FeatureImportance.py
+++ b/validmind/tests/model_validation/sklearn/FeatureImportance.py
@@ -76,9 +76,9 @@ def FeatureImportance(
 
     for i in range(num_features):
         if i < len(top_features):
-            result[
-                f"Feature {i + 1}"
-            ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
+            result[f"Feature {i + 1}"] = (
+                f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
+            )
         else:
             result[f"Feature {i + 1}"] = None
 
diff --git a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
index 502bcc270..1ecd8aa83 100644
--- a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
+++ b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
@@ -23,9 +23,7 @@ def _get_metrics(scoring):
     return (
         scoring
         if isinstance(scoring, list)
-        else list(scoring.keys())
-        if isinstance(scoring, dict)
-        else [scoring]
+        else list(scoring.keys()) if isinstance(scoring, dict) else [scoring]
     )
 
 
diff --git a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
index bbf6d5f73..b7639583d 100644
--- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
+++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
@@ -325,9 +325,9 @@ def RobustnessDiagnosis(
     # rename perturbation size for baseline
     # Convert to object type first to avoid dtype incompatibility warning
     results_df["Perturbation Size"] = results_df["Perturbation Size"].astype(object)
-    results_df.loc[
-        results_df["Perturbation Size"] == 0.0, "Perturbation Size"
-    ] = "Baseline (0.0)"
+    results_df.loc[results_df["Perturbation Size"] == 0.0, "Perturbation Size"] = (
+        "Baseline (0.0)"
+    )
 
     return (
         results_df,
diff --git a/validmind/tests/prompt_validation/ai_powered_test.py b/validmind/tests/prompt_validation/ai_powered_test.py
index b3ad420be..3e67ade96 100644
--- a/validmind/tests/prompt_validation/ai_powered_test.py
+++ b/validmind/tests/prompt_validation/ai_powered_test.py
@@ -34,7 +34,8 @@ def call_model(
         raise ValueError(
             "LLM is not configured. Please set an `OPENAI_API_KEY`, "
             "`AZURE_OPENAI_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` "
-            "environment variable or ensure that you are connected to the "
+            "environment variable, ensure Gemini can be initialized in your "
+            "environment, or ensure that you are connected to the "
             "ValidMind API and ValidMind AI is enabled for your account."
         )
 

From 0de95a6bc428237af769c3bee0dda092267a027f Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Thu, 28 May 2026 10:35:21 +0200
Subject: [PATCH 2/3] avoid logging deepeval scorers to Confident AI

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../configure_judge_llms.ipynb                | 23 +++++++++--------
 tests/unit_tests/test_ai_utils.py             | 25 +++++++++++++++++++
 validmind/ai/utils.py                         | 22 +++++++++++++++-
 .../scorers/llm/deepeval/AnswerRelevancy.py   |  5 ++--
 .../llm/deepeval/ArgumentCorrectness.py       |  5 ++--
 validmind/scorers/llm/deepeval/Bias.py        |  5 ++--
 .../llm/deepeval/ContextualPrecision.py       |  5 ++--
 .../scorers/llm/deepeval/ContextualRecall.py  |  5 ++--
 .../llm/deepeval/ContextualRelevancy.py       |  5 ++--
 .../scorers/llm/deepeval/Faithfulness.py      |  5 ++--
 .../scorers/llm/deepeval/Hallucination.py     |  5 ++--
 .../scorers/llm/deepeval/Summarization.py     |  5 ++--
 .../scorers/llm/deepeval/ToolCorrectness.py   |  5 ++--
 13 files changed, 78 insertions(+), 42 deletions(-)

diff --git a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
index 475b44c20..bde5200a4 100644
--- a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
+++ b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
@@ -14,7 +14,7 @@
         "2. RAGAS-based tests, which depend on both the default judge LLM and the default judge embeddings model.\n",
         "3. DeepEval scorers, which depend on the default local scorer model path.\n",
         "\n",
-        "The notebook automatically selects the provider using the same logic as the library itself: OpenAI takes precedence when explicitly configured, Azure OpenAI is selected next when available, and Gemini is the default fallback. Gemini can be used with `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or through a keyless setup as long as Gemini can initialize successfully in your environment."
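+        "The notebook automatically selects the provider using the same logic as the library itself: OpenAI takes precedence when explicitly configured, Azure OpenAI is selected next when available, and Gemini is the default fallback. Gemini can be used with `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or through a keyless setup as long as Gemini can initialize successfully in your environment."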
       ]
     },
     {
@@ -104,11 +104,11 @@
         "Before running this notebook, make sure you have:\n",
         "- a Python environment with the ValidMind Library and its LLM dependencies installed\n",
         "- access to a ValidMind account if you want to log results to the ValidMind Platform\n",
-        "- access to one supported judge provider in your environment\n",
+        "- credentials for one supported judge provider in your environment\n",
         "\n",
         "This notebook supports:\n",
         "- OpenAI via `OPENAI_API_KEY`, with optional `OPENAI_MODEL` and `OPENAI_EMBEDDINGS_MODEL` overrides. The current default judge model is `gpt-4.1` and the default embeddings model is `text-embedding-3-small`.\n",
-        "- Gemini with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. If `GOOGLE_API_KEY` or `GEMINI_API_KEY` is set, it will be used. Keyless Gemini setups are also supported as long as Gemini can initialize successfully in your environment. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
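+        "- Gemini with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. If `GOOGLE_API_KEY` or `GEMINI_API_KEY` is set, it will be used. Keyless Gemini setups are also supported as long as Gemini can initialize successfully in your environment. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",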
         "- Azure OpenAI via `AZURE_OPENAI_KEY`, `AZURE_OPENAI_ENDPOINT`, and `AZURE_OPENAI_MODEL`. The current default embeddings model is `text-embedding-3-small`.\n",
         "\n",
         "You can still run the notebook locally without connecting to the ValidMind Platform, but connecting a model document makes it easier to review and share results after the tests complete."
@@ -158,6 +158,7 @@
     },
     {
       "cell_type": "markdown",
+      "id": "322b05cc",
       "metadata": {},
       "source": [
         "<a id='toc3__'></a>\n",
@@ -266,11 +267,11 @@
         "import validmind as vm\n",
         "\n",
         "vm.init(\n",
-        "    api_host=\"http://localhost:5000/api/v1/tracking\",\n",
-        "    api_key=\"..\",\n",
-        "    api_secret=\"..\",\n",
+        "    api_host=\"https://app.prod.validmind.ai/api/v1/tracking\",\n",
+        "    api_key=\"...\",\n",
+        "    api_secret=\"...\",\n",
         "    document=\"documentation\", # requires library >=2.12.0\n",
-        "    model=\"..\",\n",
+        "    model=\"...\",\n",
         ")"
       ]
     },
@@ -361,14 +362,14 @@
         "\n",
         "## Configure the judge provider\n",
         "\n",
-        "The next cells load your environment variables, resolve the judge provider from the configuration available in your session, and initialize the ValidMind Library for result logging.\n",
+        "The next cells load your environment variables, resolve the judge provider from the credentials available in your session, and initialize the ValidMind Library for result logging.\n",
         "\n",
         "This notebook uses the same provider resolution logic as the library itself:\n",
         "- OpenAI is selected when `OPENAI_API_KEY` is available, with `OPENAI_MODEL` as an optional override. The current default judge model is `gpt-4.1`.\n",
         "- Azure OpenAI is selected when Azure credentials are available, using `AZURE_OPENAI_MODEL` for the judge model.\n",
-        "- Gemini is the default fallback when OpenAI and Azure are not explicitly configured. Optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides are supported. If `GOOGLE_API_KEY` or `GEMINI_API_KEY` is available, it will be used; otherwise, keyless Gemini setups are supported as long as Gemini can initialize successfully in your environment. The current defaults are `gemini-2.5-pro` and `models/text-embedding-004`.\n",
+        "- Gemini is selected when `GOOGLE_API_KEY` or `GEMINI_API_KEY` is available, with optional `GEMINI_MODEL` and `GEMINI_EMBEDDINGS_MODEL` overrides. The current defaults are `gemini-2.5-pro` and `gemini-embedding-001`.\n",
         "\n",
-        "If more than one provider is configured, OpenAI takes precedence over Azure OpenAI and Gemini, and Azure OpenAI takes precedence over Gemini, to match the library default.\n",
+        "If more than one provider is configured, OpenAI takes precedence to match the library default.\n",
         "\n",
         "This matters because the same default judge configuration is reused across multiple evaluation paths, so checking it once here makes the later test results easier to interpret."
       ]
@@ -626,7 +627,7 @@
         "\n",
         "As in the RAGAS example, we create a ValidMind dataset with `vm.init_dataset()` so the scorer workflow runs against the same kind of object customers would use in their own notebooks.\n",
         "\n",
-        "These scorers do not use the judge embeddings object. They now support both Gemini API-key setups and keyless Gemini setups through the same default model-resolution path. For this notebook, we use two representative examples:\n",
+        "These scorers do not use the judge embeddings object. For this notebook, we use two representative examples:\n",
         "- `AnswerRelevancy`\n",
         "- `Hallucination`\n",
         "\n",
diff --git a/tests/unit_tests/test_ai_utils.py b/tests/unit_tests/test_ai_utils.py
index 8a2c9f366..185587148 100644
--- a/tests/unit_tests/test_ai_utils.py
+++ b/tests/unit_tests/test_ai_utils.py
@@ -202,3 +202,41 @@ def __init__(self, **kwargs):
         "schema": "MySchema",
         "prompt": "hello",
     }
+
+
+def test_run_deepeval_evaluation_disables_confident_requests():
+    """is_confident must report False during evaluate() and be restored after."""
+    fake_is_confident = mock.Mock(return_value=True)
+    # Register the fake under the exact dotted name that run_deepeval_evaluation
+    # imports ("deepeval.test_run.test_run"); otherwise the helper never sees it.
+    fake_test_run_module = types.ModuleType("deepeval.test_run.test_run")
+    fake_test_run_module.is_confident = fake_is_confident
+    confident_states = []
+
+    def record_and_return(**_kwargs):
+        # Capture what deepeval would observe while the evaluation is running.
+        confident_states.append(fake_test_run_module.is_confident())
+        return "evaluation-result"
+
+    fake_evaluate = mock.Mock(side_effect=record_and_return)
+    fake_deepeval_module = types.ModuleType("deepeval")
+    fake_deepeval_module.evaluate = fake_evaluate
+
+    with mock.patch.dict(
+        sys.modules,
+        {
+            "deepeval": fake_deepeval_module,
+            "deepeval.test_run.test_run": fake_test_run_module,
+        },
+    ):
+        result = ai_utils.run_deepeval_evaluation(
+            test_cases=["test-case"], metrics=["metric"]
+        )
+
+    fake_evaluate.assert_called_once_with(
+        test_cases=["test-case"], metrics=["metric"]
+    )
+    assert confident_states == [False]
+    assert fake_test_run_module.is_confident is fake_is_confident
+    fake_is_confident.assert_not_called()
+    assert result == "evaluation-result"
diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py
index eff5d9d59..345ef2d90 100644
--- a/validmind/ai/utils.py
+++ b/validmind/ai/utils.py
@@ -20,7 +20,7 @@
 OPENAI_MODEL = "gpt-4.1"
 OPENAI_EMBEDDINGS_MODEL = "text-embedding-3-small"
 GEMINI_MODEL = "gemini-2.5-pro"
-GEMINI_EMBEDDINGS_MODEL = "models/text-embedding-004"
+GEMINI_EMBEDDINGS_MODEL = "gemini-embedding-001"
 
 # can be None, True or False (ternary to represent initial state, ack and failed ack)
 __ack = None
@@ -277,6 +277,44 @@ def get_model_name(self, *args, **kwargs):
     return GeminiDeepEvalModel(judge_llm, model)
 
 
+def run_deepeval_evaluation(*, test_cases, metrics):
+    """Run ``deepeval.evaluate`` with deepeval's ``is_confident`` forced to False.
+
+    ``deepeval.test_run.test_run.is_confident`` is replaced with a stub that
+    returns ``False`` for the duration of the call and is restored afterwards,
+    even when ``evaluate`` raises.
+
+    Args:
+        test_cases: Test cases forwarded unchanged to ``deepeval.evaluate``.
+        metrics: Metrics forwarded unchanged to ``deepeval.evaluate``.
+
+    Returns:
+        Whatever ``deepeval.evaluate`` returns.
+
+    Raises:
+        ImportError: If ``deepeval`` or its ``test_run`` module cannot be imported.
+    """
+    try:
+        from deepeval import evaluate
+
+        deepeval_test_run = importlib.import_module("deepeval.test_run.test_run")
+    except ImportError as err:
+        # Chain the original failure so the missing module stays visible.
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
+        ) from err
+
+    original_is_confident = deepeval_test_run.is_confident
+
+    # NOTE(review): this swaps a module-level attribute, so overlapping calls
+    # from different threads could restore the stub instead of the original.
+    try:
+        # ValidMind scorers should run locally without depending on Confident AI login state.
+        deepeval_test_run.is_confident = lambda: False
+        return evaluate(test_cases=test_cases, metrics=metrics)
+    finally:
+        deepeval_test_run.is_confident = original_is_confident
+
 def get_judge_config(judge_llm=None, judge_embeddings=None):
     Embeddings, BaseChatModel, FunctionModel = _import_judge_dependencies()
 
diff --git a/validmind/scorers/llm/deepeval/AnswerRelevancy.py b/validmind/scorers/llm/deepeval/AnswerRelevancy.py
index 4a6de7e64..0f3689bd9 100644
--- a/validmind/scorers/llm/deepeval/AnswerRelevancy.py
+++ b/validmind/scorers/llm/deepeval/AnswerRelevancy.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import AnswerRelevancyMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -83,7 +82,7 @@ def AnswerRelevancy(
             input=input,
             actual_output=actual_output,
         )
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
 
         # Extract score and reason from the metric result
         metric_data = result.test_results[0].metrics_data[0]
diff --git a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
index c453bee91..acbb7dadf 100644
--- a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
+++ b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import ArgumentCorrectnessMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -103,7 +102,7 @@ def ArgumentCorrectness(
             tools_called=actual_tools_list,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Bias.py b/validmind/scorers/llm/deepeval/Bias.py
index 533d4d862..6245d7c08 100644
--- a/validmind/scorers/llm/deepeval/Bias.py
+++ b/validmind/scorers/llm/deepeval/Bias.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import BiasMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -91,7 +90,7 @@ def Bias(
             actual_output=actual_output_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
 
         # Extract score and reason from the metric result
         metric_data = result.test_results[0].metrics_data[0]
diff --git a/validmind/scorers/llm/deepeval/ContextualPrecision.py b/validmind/scorers/llm/deepeval/ContextualPrecision.py
index 304816a4c..7e44aa0ce 100644
--- a/validmind/scorers/llm/deepeval/ContextualPrecision.py
+++ b/validmind/scorers/llm/deepeval/ContextualPrecision.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import ContextualPrecisionMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualPrecision(
             retrieval_context=retrieval_context_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ContextualRecall.py b/validmind/scorers/llm/deepeval/ContextualRecall.py
index ef88a3da5..80e5c2076 100644
--- a/validmind/scorers/llm/deepeval/ContextualRecall.py
+++ b/validmind/scorers/llm/deepeval/ContextualRecall.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import ContextualRecallMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualRecall(
             retrieval_context=retrieval_context_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ContextualRelevancy.py b/validmind/scorers/llm/deepeval/ContextualRelevancy.py
index d1c9e91f8..9f9155ff7 100644
--- a/validmind/scorers/llm/deepeval/ContextualRelevancy.py
+++ b/validmind/scorers/llm/deepeval/ContextualRelevancy.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import ContextualRelevancyMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualRelevancy(
             retrieval_context=retrieval_context_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Faithfulness.py b/validmind/scorers/llm/deepeval/Faithfulness.py
index 555455d4c..0c3a16a4f 100644
--- a/validmind/scorers/llm/deepeval/Faithfulness.py
+++ b/validmind/scorers/llm/deepeval/Faithfulness.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import FaithfulnessMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -101,7 +100,7 @@ def Faithfulness(
             retrieval_context=retrieval_context_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Hallucination.py b/validmind/scorers/llm/deepeval/Hallucination.py
index a7c9a824e..7c92f21f7 100644
--- a/validmind/scorers/llm/deepeval/Hallucination.py
+++ b/validmind/scorers/llm/deepeval/Hallucination.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import HallucinationMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -101,7 +100,7 @@ def Hallucination(
             context=context_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Summarization.py b/validmind/scorers/llm/deepeval/Summarization.py
index 56b9ba918..738ed677e 100644
--- a/validmind/scorers/llm/deepeval/Summarization.py
+++ b/validmind/scorers/llm/deepeval/Summarization.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List, Optional
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import SummarizationMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -102,7 +101,7 @@ def Summarization(
             actual_output=actual_output_value,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ToolCorrectness.py b/validmind/scorers/llm/deepeval/ToolCorrectness.py
index 5f7a085e8..67ca61f7f 100644
--- a/validmind/scorers/llm/deepeval/ToolCorrectness.py
+++ b/validmind/scorers/llm/deepeval/ToolCorrectness.py
@@ -5,13 +5,12 @@
 from typing import Any, Dict, List
 
 from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
 from validmind.errors import MissingDependencyError
 from validmind.tests.decorator import scorer
 from validmind.vm_models.dataset import VMDataset
 
 try:
-    from deepeval import evaluate
     from deepeval.metrics import ToolCorrectnessMetric
     from deepeval.test_case import LLMTestCase
 except ImportError as e:
@@ -108,7 +107,7 @@ def ToolCorrectness(
             tools_called=actual_tools_list,
         )
 
-        result = evaluate(test_cases=[test_case], metrics=[metric])
+        result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")

From 0c9182ac28cce5f159975316705e471a6ddb55a2 Mon Sep 17 00:00:00 2001
From: Juan <juan@validmind.ai>
Date: Thu, 28 May 2026 22:28:24 +0200
Subject: [PATCH 3/3] 2.13.5

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 pyproject.toml           | 2 +-
 r/validmind/DESCRIPTION  | 2 +-
 uv.lock                  | 2 +-
 validmind/__version__.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d7ea1163..43dfa7ec6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
 description = "ValidMind Library"
 readme = "README.pypi.md"
 requires-python = ">=3.9,<3.15"
diff --git a/r/validmind/DESCRIPTION b/r/validmind/DESCRIPTION
index c8ff9c220..b1ca4c9d0 100644
--- a/r/validmind/DESCRIPTION
+++ b/r/validmind/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: validmind
 Type: Package
 Title: Interface to the 'ValidMind' Platform
-Version: 2.13.4
+Version: 2.13.5
 Authors@R: c(person("Andres", "Rodriguez", role = c("aut", "cre","cph"),
                       email = "andres@validmind.ai"))
 Maintainer: Andres Rodriguez <andres@validmind.ai>
diff --git a/uv.lock b/uv.lock
index b0efd17eb..4f0ada24f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -11361,7 +11361,7 @@ wheels = [
 
 [[package]]
 name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp", extra = ["speedups"] },
diff --git a/validmind/__version__.py b/validmind/__version__.py
index 8f57b15e9..d00ccbef0 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.13.4"
+__version__ = "2.13.5"