diff --git a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
index 3e8d27bcd..bde5200a4 100644
--- a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
+++ b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
@@ -158,6 +158,7 @@
},
{
"cell_type": "markdown",
+ "id": "322b05cc",
"metadata": {},
"source": [
"\n",
@@ -266,11 +267,11 @@
"import validmind as vm\n",
"\n",
"vm.init(\n",
- " api_host=\"http://localhost:5000/api/v1/tracking\",\n",
- " api_key=\"..\",\n",
- " api_secret=\"..\",\n",
+ " api_host=\"https://app.prod.validmind.ai/api/v1/tracking\",\n",
+ " api_key=\"...\",\n",
+ " api_secret=\"...\",\n",
" document=\"documentation\", # requires library >=2.12.0\n",
- " model=\"..\",\n",
+ " model=\"...\",\n",
")"
]
},
diff --git a/pyproject.toml b/pyproject.toml
index 3d7ea1163..43dfa7ec6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
description = "ValidMind Library"
readme = "README.pypi.md"
requires-python = ">=3.9,<3.15"
diff --git a/r/validmind/DESCRIPTION b/r/validmind/DESCRIPTION
index c8ff9c220..b1ca4c9d0 100644
--- a/r/validmind/DESCRIPTION
+++ b/r/validmind/DESCRIPTION
@@ -1,7 +1,7 @@
Package: validmind
Type: Package
Title: Interface to the 'ValidMind' Platform
-Version: 2.13.4
+Version: 2.13.5
Authors@R: c(person("Andres", "Rodriguez", role = c("aut", "cre","cph"),
email = "andres@validmind.ai"))
Maintainer: Andres Rodriguez
diff --git a/tests/unit_tests/test_ai_utils.py b/tests/unit_tests/test_ai_utils.py
index bdc857bd5..185587148 100644
--- a/tests/unit_tests/test_ai_utils.py
+++ b/tests/unit_tests/test_ai_utils.py
@@ -24,6 +24,16 @@ def test_get_client_and_model_supports_gemini_env():
assert model == ai_utils.GEMINI_MODEL
+def test_get_client_and_model_defaults_to_gemini_without_provider_env():
+ """With an empty environment, no client is built and the default Gemini model is returned."""
+ _reset_ai_utils_state()
+
+ # clear=True empties os.environ for the duration of the block, so no
+ # provider key of any kind is visible to get_client_and_model().
+ with mock.patch.dict(os.environ, {}, clear=True):
+ client, model = ai_utils.get_client_and_model()
+
+ assert client is None
+ assert model == ai_utils.GEMINI_MODEL
+
+
def test_get_judge_config_builds_gemini_models():
_reset_ai_utils_state()
@@ -59,6 +69,35 @@ def __init__(self, **kwargs):
}
+def test_get_judge_config_builds_gemini_models_without_provider_env():
+ """With an empty environment, get_judge_config() builds the Gemini chat and
+ embeddings objects with a `model` keyword only."""
+ _reset_ai_utils_state()
+
+ # Stand-ins that record the keyword arguments they were constructed with.
+ class FakeChatGoogleGenerativeAI:
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ class FakeGoogleGenerativeAIEmbeddings:
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ # Registered in sys.modules below under the name `langchain_google_genai`.
+ fake_module = types.SimpleNamespace(
+ ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+ GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+ )
+
+ with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+ sys.modules, {"langchain_google_genai": fake_module}
+ ):
+ judge_llm, judge_embeddings = ai_utils.get_judge_config()
+
+ # Exact-equality checks: any extra keyword (e.g. an API key) would fail them.
+ assert isinstance(judge_llm, FakeChatGoogleGenerativeAI)
+ assert judge_llm.kwargs == {"model": ai_utils.GEMINI_MODEL}
+ assert isinstance(judge_embeddings, FakeGoogleGenerativeAIEmbeddings)
+ assert judge_embeddings.kwargs == {
+ "model": ai_utils.GEMINI_EMBEDDINGS_MODEL,
+ }
+
+
def test_is_configured_uses_resolved_judge_model():
_reset_ai_utils_state()
@@ -97,3 +136,94 @@ def __init__(self, **kwargs):
"api_key": "test-key",
"temperature": 0,
}
+
+
+def test_get_deepeval_model_supports_keyless_gemini_without_provider_env():
+ """With an empty environment, get_deepeval_model() returns a DeepEvalBaseLLM
+ subclass instance that forwards sync/async generation to the chat model."""
+ _reset_ai_utils_state()
+
+ # Stub base class; the returned model is checked against it with isinstance.
+ class FakeDeepEvalBaseLLM:
+ pass
+
+ # Stub chat model: plain calls echo the prompt with a "sync:"/"async:"
+ # prefix, structured calls return the schema and prompt in a dict.
+ class FakeChatGoogleGenerativeAI:
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ def invoke(self, prompt):
+ return types.SimpleNamespace(content=f"sync:{prompt}")
+
+ async def ainvoke(self, prompt):
+ return types.SimpleNamespace(content=f"async:{prompt}")
+
+ def with_structured_output(self, schema):
+ class StructuredModel:
+ def invoke(self, prompt):
+ return {"schema": schema, "prompt": prompt}
+
+ async def ainvoke(self, prompt):
+ return {"schema": schema, "prompt": prompt}
+
+ return StructuredModel()
+
+ class FakeGoogleGenerativeAIEmbeddings:
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ fake_google_module = types.SimpleNamespace(
+ ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+ GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+ )
+ fake_deepeval_module = types.ModuleType("deepeval")
+ fake_deepeval_models_module = types.ModuleType("deepeval.models")
+ fake_deepeval_base_model = types.SimpleNamespace(DeepEvalBaseLLM=FakeDeepEvalBaseLLM)
+
+ # Both the environment and the relevant sys.modules entries are replaced
+ # only for the duration of this block.
+ with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+ sys.modules,
+ {
+ "deepeval": fake_deepeval_module,
+ "deepeval.models": fake_deepeval_models_module,
+ "langchain_google_genai": fake_google_module,
+ "deepeval.models.base_model": fake_deepeval_base_model,
+ },
+ ):
+ model = ai_utils.get_deepeval_model()
+
+ assert isinstance(model, FakeDeepEvalBaseLLM)
+ assert model.get_model_name() == ai_utils.GEMINI_MODEL
+ # Without a schema the `.content` of the chat response is returned.
+ assert model.generate("hello") == "sync:hello"
+ # With a schema the structured-output result is returned as-is.
+ assert model.generate("hello", schema="MySchema") == {
+ "schema": "MySchema",
+ "prompt": "hello",
+ }
+
+ import asyncio
+
+ assert asyncio.run(model.a_generate("hello")) == "async:hello"
+ assert asyncio.run(model.a_generate("hello", schema="MySchema")) == {
+ "schema": "MySchema",
+ "prompt": "hello",
+ }
+
+
+def test_run_deepeval_evaluation_disables_confident_requests():
+    """run_deepeval_evaluation() forces `is_confident()` to False while
+    `evaluate` runs and restores the original callable afterwards."""
+    fake_is_confident = mock.Mock(return_value=True)
+    fake_test_run_module = types.SimpleNamespace(is_confident=fake_is_confident)
+    seen_during_evaluate = []
+
+    def record_confident_state(**kwargs):
+        # Sample the flag from inside the evaluate call, which is the only
+        # moment the override is supposed to be active.
+        seen_during_evaluate.append(fake_test_run_module.is_confident())
+        return "evaluation-result"
+
+    fake_evaluate = mock.Mock(side_effect=record_confident_state)
+    fake_deepeval_module = types.ModuleType("deepeval")
+    fake_deepeval_module.evaluate = fake_evaluate
+
+    with mock.patch.dict(
+        sys.modules,
+        {
+            "deepeval": fake_deepeval_module,
+            "deepeval.test_run": types.ModuleType("deepeval.test_run"),
+            # Must match the dotted name run_deepeval_evaluation() imports,
+            # otherwise the helper patches some other module (or fails to
+            # import) and this stub is never touched.
+            "deepeval.test_run.test_run": fake_test_run_module,
+        },
+    ):
+        result = ai_utils.run_deepeval_evaluation(
+            test_cases=["test-case"], metrics=["metric"]
+        )
+
+    fake_evaluate.assert_called_once_with(
+        test_cases=["test-case"], metrics=["metric"]
+    )
+    # The override was active during evaluate, so the original mock was never
+    # the callable that got invoked.
+    assert seen_during_evaluate == [False]
+    fake_is_confident.assert_not_called()
+    # The original callable is put back once evaluate returns.
+    assert fake_test_run_module.is_confident is fake_is_confident
+    assert result == "evaluation-result"
diff --git a/uv.lock b/uv.lock
index b0efd17eb..4f0ada24f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -11361,7 +11361,7 @@ wheels = [
[[package]]
name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
source = { editable = "." }
dependencies = [
{ name = "aiohttp", extra = ["speedups"] },
diff --git a/validmind/__version__.py b/validmind/__version__.py
index 8f57b15e9..d00ccbef0 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.13.4"
+__version__ = "2.13.5"
diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py
index e717ebe59..345ef2d90 100644
--- a/validmind/ai/utils.py
+++ b/validmind/ai/utils.py
@@ -20,7 +20,7 @@
OPENAI_MODEL = "gpt-4.1"
OPENAI_EMBEDDINGS_MODEL = "text-embedding-3-small"
GEMINI_MODEL = "gemini-2.5-pro"
-GEMINI_EMBEDDINGS_MODEL = "models/text-embedding-004"
+GEMINI_EMBEDDINGS_MODEL = "gemini-embedding-001"
# can be None, True or False (ternary to represent initial state, ack and failed ack)
__ack = None
@@ -59,10 +59,7 @@ def _get_configured_provider():
if os.getenv("AZURE_OPENAI_KEY"):
return "azure"
- if _get_google_api_key():
- return "gemini"
-
- return None
+ return "gemini"
def get_client_and_model():
@@ -104,18 +101,12 @@ def get_client_and_model():
logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")
- elif provider == "gemini":
+ else:
__client = None
__model = os.getenv("GEMINI_MODEL", GEMINI_MODEL)
logger.debug(f"Using Gemini {__model} for generating descriptions")
- else:
- raise ValueError(
- "OPENAI_API_KEY, AZURE_OPENAI_KEY, GOOGLE_API_KEY, or GEMINI_API_KEY "
- "must be setup to use LLM features"
- )
-
return __client, __model
@@ -202,16 +193,18 @@ def _build_gemini_judge_config(model):
langchain_google_genai, "GoogleGenerativeAIEmbeddings"
)
google_api_key = _get_google_api_key()
+ chat_kwargs = {"model": model}
+ embeddings_kwargs = {
+ "model": os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
+ }
+
+ if google_api_key:
+ chat_kwargs["api_key"] = google_api_key
+ embeddings_kwargs["google_api_key"] = google_api_key
return (
- ChatGoogleGenerativeAI(
- model=model,
- api_key=google_api_key,
- ),
- GoogleGenerativeAIEmbeddings(
- model=os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
- google_api_key=google_api_key,
- ),
+ ChatGoogleGenerativeAI(**chat_kwargs),
+ GoogleGenerativeAIEmbeddings(**embeddings_kwargs),
)
@@ -230,6 +223,80 @@ def _build_openai_judge_config(client, model):
)
+def _import_deepeval_base_llm():
+    """Return the `DeepEvalBaseLLM` class from `deepeval.models.base_model`.
+
+    Raises:
+        ImportError: If `deepeval.models.base_model` cannot be imported. The
+            original import failure is chained as the cause.
+    """
+    try:
+        deepeval_base_model = importlib.import_module("deepeval.models.base_model")
+    except ImportError as err:
+        # Chain explicitly so the traceback shows which import actually failed.
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
+        ) from err
+
+    return getattr(deepeval_base_model, "DeepEvalBaseLLM")
+
+
+def _unwrap_deepeval_response(response):
+    """Return `response.content` when that attribute exists, else `response` itself."""
+    try:
+        return response.content
+    except AttributeError:
+        return response
+
+
+def _build_gemini_deepeval_model(model):
+    """Wrap the Gemini judge chat model in a `DeepEvalBaseLLM` subclass.
+
+    Args:
+        model: Model name passed to `_build_gemini_judge_config` and reported
+            by the wrapper's `get_model_name`.
+
+    Returns:
+        An instance of a locally defined `DeepEvalBaseLLM` subclass that
+        forwards `generate` / `a_generate` to the judge chat model.
+    """
+    base_llm_cls = _import_deepeval_base_llm()
+    # Only the chat model is needed here; the embeddings object is discarded.
+    gemini_chat, _ = _build_gemini_judge_config(model)
+
+    class GeminiDeepEvalModel(base_llm_cls):
+        def __init__(self, chat_model, model_name):
+            self._chat_model = chat_model
+            self._model_name = model_name
+            self.model = self.load_model()
+
+        def load_model(self, *args, **kwargs):
+            return self._chat_model
+
+        def _runnable_for(self, schema):
+            # Use structured output only when a schema was requested and the
+            # chat model offers it; otherwise call the chat model directly.
+            llm = self.load_model()
+            if schema is None or not hasattr(llm, "with_structured_output"):
+                return llm
+            return llm.with_structured_output(schema)
+
+        def generate(self, prompt: str, schema=None):
+            reply = self._runnable_for(schema).invoke(prompt)
+            return _unwrap_deepeval_response(reply)
+
+        async def a_generate(self, prompt: str, schema=None):
+            reply = await self._runnable_for(schema).ainvoke(prompt)
+            return _unwrap_deepeval_response(reply)
+
+        def get_model_name(self, *args, **kwargs):
+            return self._model_name
+
+    return GeminiDeepEvalModel(gemini_chat, model)
+
+
+def run_deepeval_evaluation(*, test_cases, metrics):
+    """Run `deepeval.evaluate` with `is_confident` forced to return False.
+
+    Args:
+        test_cases: Passed through to `deepeval.evaluate` as `test_cases`.
+        metrics: Passed through to `deepeval.evaluate` as `metrics`.
+
+    Returns:
+        Whatever `deepeval.evaluate` returns.
+
+    Raises:
+        ImportError: If `deepeval` or `deepeval.test_run.test_run` cannot be
+            imported. The original failure is chained as the cause.
+    """
+    try:
+        from deepeval import evaluate
+
+        deepeval_test_run = importlib.import_module("deepeval.test_run.test_run")
+    except ImportError as err:
+        # The DeepEval scorers call this helper whatever provider is in use,
+        # so the install hint must not single out Gemini.
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use DeepEval scorers"
+        ) from err
+
+    original_is_confident = deepeval_test_run.is_confident
+
+    try:
+        # ValidMind scorers should run locally without depending on Confident AI login state.
+        # NOTE: this swaps a module attribute, so the override is visible to
+        # every caller in the process until the `finally` below restores it.
+        deepeval_test_run.is_confident = lambda: False
+        return evaluate(test_cases=test_cases, metrics=metrics)
+    finally:
+        deepeval_test_run.is_confident = original_is_confident
+
+
def get_judge_config(judge_llm=None, judge_embeddings=None):
Embeddings, BaseChatModel, FunctionModel = _import_judge_dependencies()
@@ -270,6 +337,10 @@ def get_deepeval_model():
_, model = get_client_and_model()
if provider == "gemini":
+ google_api_key = _get_google_api_key()
+ if google_api_key is None:
+ return _build_gemini_deepeval_model(model)
+
try:
deepeval_models = importlib.import_module("deepeval.models")
except ImportError:
@@ -280,7 +351,7 @@ def get_deepeval_model():
GeminiModel = getattr(deepeval_models, "GeminiModel")
return GeminiModel(
model=model,
- api_key=_get_google_api_key(),
+ api_key=google_api_key,
temperature=0,
)
diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py
index a3f05ac47..c35c753cd 100644
--- a/validmind/datasets/llm/agent_dataset.py
+++ b/validmind/datasets/llm/agent_dataset.py
@@ -413,14 +413,16 @@ def to_deepeval_test_cases(self) -> List[Any]:
test_case = LLMTestCase(
input=str(row["input"]),
- actual_output=str(row["actual_output"])
- if pd.notna(row["actual_output"])
- else "",
+ actual_output=(
+ str(row["actual_output"])
+ if pd.notna(row["actual_output"])
+ else ""
+ ),
expected_output=expected_output_val,
context=context_val if context_val else None,
- retrieval_context=retrieval_context_val
- if retrieval_context_val
- else None,
+ retrieval_context=(
+ retrieval_context_val if retrieval_context_val else None
+ ),
# Note: tools_called deserialization would need more complex logic
# for now we'll keep it simple
)
diff --git a/validmind/scorers/llm/deepeval/AnswerRelevancy.py b/validmind/scorers/llm/deepeval/AnswerRelevancy.py
index 4a6de7e64..0f3689bd9 100644
--- a/validmind/scorers/llm/deepeval/AnswerRelevancy.py
+++ b/validmind/scorers/llm/deepeval/AnswerRelevancy.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -83,7 +82,7 @@ def AnswerRelevancy(
input=input,
actual_output=actual_output,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
# Extract score and reason from the metric result
metric_data = result.test_results[0].metrics_data[0]
diff --git a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
index c453bee91..acbb7dadf 100644
--- a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
+++ b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import ArgumentCorrectnessMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -103,7 +102,7 @@ def ArgumentCorrectness(
tools_called=actual_tools_list,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Bias.py b/validmind/scorers/llm/deepeval/Bias.py
index 533d4d862..6245d7c08 100644
--- a/validmind/scorers/llm/deepeval/Bias.py
+++ b/validmind/scorers/llm/deepeval/Bias.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -91,7 +90,7 @@ def Bias(
actual_output=actual_output_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
# Extract score and reason from the metric result
metric_data = result.test_results[0].metrics_data[0]
diff --git a/validmind/scorers/llm/deepeval/ContextualPrecision.py b/validmind/scorers/llm/deepeval/ContextualPrecision.py
index 304816a4c..7e44aa0ce 100644
--- a/validmind/scorers/llm/deepeval/ContextualPrecision.py
+++ b/validmind/scorers/llm/deepeval/ContextualPrecision.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualPrecision(
retrieval_context=retrieval_context_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ContextualRecall.py b/validmind/scorers/llm/deepeval/ContextualRecall.py
index ef88a3da5..80e5c2076 100644
--- a/validmind/scorers/llm/deepeval/ContextualRecall.py
+++ b/validmind/scorers/llm/deepeval/ContextualRecall.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualRecall(
retrieval_context=retrieval_context_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ContextualRelevancy.py b/validmind/scorers/llm/deepeval/ContextualRelevancy.py
index d1c9e91f8..9f9155ff7 100644
--- a/validmind/scorers/llm/deepeval/ContextualRelevancy.py
+++ b/validmind/scorers/llm/deepeval/ContextualRelevancy.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -101,7 +100,7 @@ def ContextualRelevancy(
retrieval_context=retrieval_context_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Faithfulness.py b/validmind/scorers/llm/deepeval/Faithfulness.py
index 555455d4c..0c3a16a4f 100644
--- a/validmind/scorers/llm/deepeval/Faithfulness.py
+++ b/validmind/scorers/llm/deepeval/Faithfulness.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -101,7 +100,7 @@ def Faithfulness(
retrieval_context=retrieval_context_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Hallucination.py b/validmind/scorers/llm/deepeval/Hallucination.py
index a7c9a824e..7c92f21f7 100644
--- a/validmind/scorers/llm/deepeval/Hallucination.py
+++ b/validmind/scorers/llm/deepeval/Hallucination.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -101,7 +100,7 @@ def Hallucination(
context=context_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/Summarization.py b/validmind/scorers/llm/deepeval/Summarization.py
index 56b9ba918..738ed677e 100644
--- a/validmind/scorers/llm/deepeval/Summarization.py
+++ b/validmind/scorers/llm/deepeval/Summarization.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List, Optional
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -102,7 +101,7 @@ def Summarization(
actual_output=actual_output_value,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/ToolCorrectness.py b/validmind/scorers/llm/deepeval/ToolCorrectness.py
index 5f7a085e8..67ca61f7f 100644
--- a/validmind/scorers/llm/deepeval/ToolCorrectness.py
+++ b/validmind/scorers/llm/deepeval/ToolCorrectness.py
@@ -5,13 +5,12 @@
from typing import Any, Dict, List
from validmind import tags, tasks
-from validmind.ai.utils import get_deepeval_model
+from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation
from validmind.errors import MissingDependencyError
from validmind.tests.decorator import scorer
from validmind.vm_models.dataset import VMDataset
try:
- from deepeval import evaluate
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase
except ImportError as e:
@@ -108,7 +107,7 @@ def ToolCorrectness(
tools_called=actual_tools_list,
)
- result = evaluate(test_cases=[test_case], metrics=[metric])
+ result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
diff --git a/validmind/scorers/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py
index 4a1de3536..3cf84a2d7 100644
--- a/validmind/scorers/llm/deepeval/__init__.py
+++ b/validmind/scorers/llm/deepeval/__init__.py
@@ -104,7 +104,7 @@ def _extract_tool_calls_from_message(
def extract_tool_calls_from_agent_output(
- agent_output: Dict[str, Any]
+ agent_output: Dict[str, Any],
) -> List[ToolCall]:
"""Extract ToolCall objects from an agent's output.
diff --git a/validmind/tests/data_validation/MissingValues.py b/validmind/tests/data_validation/MissingValues.py
index 63b924f88..78532f106 100644
--- a/validmind/tests/data_validation/MissingValues.py
+++ b/validmind/tests/data_validation/MissingValues.py
@@ -62,9 +62,9 @@ def MissingValues(
"Column": col,
"Number of Missing Values": missing[col],
"Percentage of Missing Values (%)": missing_pct[col],
- "Pass/Fail": "Pass"
- if missing_pct[col] <= min_percentage_threshold
- else "Fail",
+ "Pass/Fail": (
+ "Pass" if missing_pct[col] <= min_percentage_threshold else "Fail"
+ ),
}
for col in missing.index
],
diff --git a/validmind/tests/model_validation/ragas/AnswerCorrectness.py b/validmind/tests/model_validation/ragas/AnswerCorrectness.py
index cfd145920..2081bce50 100644
--- a/validmind/tests/model_validation/ragas/AnswerCorrectness.py
+++ b/validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -125,7 +125,7 @@ def AnswerCorrectness(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[answer_correctness()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "answer_correctness"
diff --git a/validmind/tests/model_validation/ragas/AspectCritic.py b/validmind/tests/model_validation/ragas/AspectCritic.py
index 176da2eec..f52446101 100644
--- a/validmind/tests/model_validation/ragas/AspectCritic.py
+++ b/validmind/tests/model_validation/ragas/AspectCritic.py
@@ -165,7 +165,7 @@ def AspectCritic(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=all_aspects,
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
# reverse the score for aspects where lower is better
diff --git a/validmind/tests/model_validation/ragas/ContextEntityRecall.py b/validmind/tests/model_validation/ragas/ContextEntityRecall.py
index 756954c48..a95bed63f 100644
--- a/validmind/tests/model_validation/ragas/ContextEntityRecall.py
+++ b/validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -120,7 +120,7 @@ def ContextEntityRecall(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[context_entity_recall()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "context_entity_recall"
diff --git a/validmind/tests/model_validation/ragas/ContextPrecision.py b/validmind/tests/model_validation/ragas/ContextPrecision.py
index 06a355182..e6890916b 100644
--- a/validmind/tests/model_validation/ragas/ContextPrecision.py
+++ b/validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -116,7 +116,7 @@ def ContextPrecision(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[context_precision()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "llm_context_precision_with_reference"
diff --git a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
index 04154d1f2..3b9d91715 100644
--- a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
+++ b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
@@ -111,7 +111,7 @@ def ContextPrecisionWithoutReference(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[context_precision()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "llm_context_precision_without_reference"
diff --git a/validmind/tests/model_validation/ragas/ContextRecall.py b/validmind/tests/model_validation/ragas/ContextRecall.py
index 17272e372..511fd7b89 100644
--- a/validmind/tests/model_validation/ragas/ContextRecall.py
+++ b/validmind/tests/model_validation/ragas/ContextRecall.py
@@ -117,7 +117,7 @@ def ContextRecall(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[context_recall()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "context_recall"
diff --git a/validmind/tests/model_validation/ragas/Faithfulness.py b/validmind/tests/model_validation/ragas/Faithfulness.py
index 0cdc8d1c1..6c25068b9 100644
--- a/validmind/tests/model_validation/ragas/Faithfulness.py
+++ b/validmind/tests/model_validation/ragas/Faithfulness.py
@@ -122,7 +122,7 @@ def Faithfulness(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[faithfulness()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "faithfulness"
diff --git a/validmind/tests/model_validation/ragas/SemanticSimilarity.py b/validmind/tests/model_validation/ragas/SemanticSimilarity.py
index aa5c953ef..309331228 100644
--- a/validmind/tests/model_validation/ragas/SemanticSimilarity.py
+++ b/validmind/tests/model_validation/ragas/SemanticSimilarity.py
@@ -114,7 +114,7 @@ def SemanticSimilarity(
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[semantic_similarity()],
- **get_ragas_config(judge_llm, judge_embeddings)
+ **get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()
score_column = "semantic_similarity"
diff --git a/validmind/tests/model_validation/sklearn/FeatureImportance.py b/validmind/tests/model_validation/sklearn/FeatureImportance.py
index 91cb32618..8f4769161 100644
--- a/validmind/tests/model_validation/sklearn/FeatureImportance.py
+++ b/validmind/tests/model_validation/sklearn/FeatureImportance.py
@@ -76,9 +76,9 @@ def FeatureImportance(
for i in range(num_features):
if i < len(top_features):
- result[
- f"Feature {i + 1}"
- ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
+ result[f"Feature {i + 1}"] = (
+ f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
+ )
else:
result[f"Feature {i + 1}"] = None
diff --git a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
index 502bcc270..1ecd8aa83 100644
--- a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
+++ b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py
@@ -23,9 +23,7 @@ def _get_metrics(scoring):
return (
scoring
if isinstance(scoring, list)
- else list(scoring.keys())
- if isinstance(scoring, dict)
- else [scoring]
+ else list(scoring.keys()) if isinstance(scoring, dict) else [scoring]
)
diff --git a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
index bbf6d5f73..b7639583d 100644
--- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
+++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
@@ -325,9 +325,9 @@ def RobustnessDiagnosis(
# rename perturbation size for baseline
# Convert to object type first to avoid dtype incompatibility warning
results_df["Perturbation Size"] = results_df["Perturbation Size"].astype(object)
- results_df.loc[
- results_df["Perturbation Size"] == 0.0, "Perturbation Size"
- ] = "Baseline (0.0)"
+ results_df.loc[results_df["Perturbation Size"] == 0.0, "Perturbation Size"] = (
+ "Baseline (0.0)"
+ )
return (
results_df,
diff --git a/validmind/tests/prompt_validation/ai_powered_test.py b/validmind/tests/prompt_validation/ai_powered_test.py
index b3ad420be..3e67ade96 100644
--- a/validmind/tests/prompt_validation/ai_powered_test.py
+++ b/validmind/tests/prompt_validation/ai_powered_test.py
@@ -34,7 +34,8 @@ def call_model(
raise ValueError(
"LLM is not configured. Please set an `OPENAI_API_KEY`, "
"`AZURE_OPENAI_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` "
- "environment variable or ensure that you are connected to the "
+ "environment variable, ensure Gemini can be initialized in your "
+ "environment, or ensure that you are connected to the "
"ValidMind API and ValidMind AI is enabled for your account."
)