diff --git a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
index 3e8d27bcd..bde5200a4 100644
--- a/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
+++ b/notebooks/how_to/tests/run_tests/configure_tests/configure_judge_llms.ipynb
@@ -158,6 +158,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "322b05cc",
    "metadata": {},
    "source": [
     "\n",
@@ -266,11 +267,11 @@
     "import validmind as vm\n",
     "\n",
     "vm.init(\n",
-    "    api_host=\"http://localhost:5000/api/v1/tracking\",\n",
-    "    api_key=\"..\",\n",
-    "    api_secret=\"..\",\n",
+    "    api_host=\"https://app.prod.validmind.ai/api/v1/tracking\",\n",
+    "    api_key=\"...\",\n",
+    "    api_secret=\"...\",\n",
     "    document=\"documentation\",  # requires library >=2.12.0\n",
-    "    model=\"..\",\n",
+    "    model=\"...\",\n",
     ")"
    ]
   },
diff --git a/pyproject.toml b/pyproject.toml
index 3d7ea1163..43dfa7ec6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
 description = "ValidMind Library"
 readme = "README.pypi.md"
 requires-python = ">=3.9,<3.15"
diff --git a/r/validmind/DESCRIPTION b/r/validmind/DESCRIPTION
index c8ff9c220..b1ca4c9d0 100644
--- a/r/validmind/DESCRIPTION
+++ b/r/validmind/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: validmind
 Type: Package
 Title: Interface to the 'ValidMind' Platform
-Version: 2.13.4
+Version: 2.13.5
 Authors@R: c(person("Andres", "Rodriguez", role = c("aut", "cre","cph"),
                     email = "andres@validmind.ai"))
 Maintainer: Andres Rodriguez
diff --git a/tests/unit_tests/test_ai_utils.py b/tests/unit_tests/test_ai_utils.py
index bdc857bd5..185587148 100644
--- a/tests/unit_tests/test_ai_utils.py
+++ b/tests/unit_tests/test_ai_utils.py
@@ -24,6 +24,16 @@ def test_get_client_and_model_supports_gemini_env():
     assert model == ai_utils.GEMINI_MODEL
 
 
+def test_get_client_and_model_defaults_to_gemini_without_provider_env():
+    _reset_ai_utils_state()
+
+    with mock.patch.dict(os.environ, {}, clear=True):
+        client, model = ai_utils.get_client_and_model()
+
+    assert client is None
+    assert model == ai_utils.GEMINI_MODEL
+
+
 def test_get_judge_config_builds_gemini_models():
     _reset_ai_utils_state()
 
@@ -59,6 +69,35 @@ def __init__(self, **kwargs):
     }
 
 
+def test_get_judge_config_builds_gemini_models_without_provider_env():
+    _reset_ai_utils_state()
+
+    class FakeChatGoogleGenerativeAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeGoogleGenerativeAIEmbeddings:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    fake_module = types.SimpleNamespace(
+        ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+        GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+    )
+
+    with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+        sys.modules, {"langchain_google_genai": fake_module}
+    ):
+        judge_llm, judge_embeddings = ai_utils.get_judge_config()
+
+    assert isinstance(judge_llm, FakeChatGoogleGenerativeAI)
+    assert judge_llm.kwargs == {"model": ai_utils.GEMINI_MODEL}
+    assert isinstance(judge_embeddings, FakeGoogleGenerativeAIEmbeddings)
+    assert judge_embeddings.kwargs == {
+        "model": ai_utils.GEMINI_EMBEDDINGS_MODEL,
+    }
+
+
 def test_is_configured_uses_resolved_judge_model():
     _reset_ai_utils_state()
 
@@ -97,3 +136,104 @@ def __init__(self, **kwargs):
         "api_key": "test-key",
         "temperature": 0,
     }
+
+
+def test_get_deepeval_model_supports_keyless_gemini_without_provider_env():
+    _reset_ai_utils_state()
+
+    class FakeDeepEvalBaseLLM:
+        pass
+
+    class FakeChatGoogleGenerativeAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+        def invoke(self, prompt):
+            return types.SimpleNamespace(content=f"sync:{prompt}")
+
+        async def ainvoke(self, prompt):
+            return types.SimpleNamespace(content=f"async:{prompt}")
+
+        def with_structured_output(self, schema):
+            class StructuredModel:
+                def invoke(self, prompt):
+                    return {"schema": schema, "prompt": prompt}
+
+                async def ainvoke(self, prompt):
+                    return {"schema": schema, "prompt": prompt}
+
+            return StructuredModel()
+
+    class FakeGoogleGenerativeAIEmbeddings:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    fake_google_module = types.SimpleNamespace(
+        ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
+        GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
+    )
+    fake_deepeval_module = types.ModuleType("deepeval")
+    fake_deepeval_models_module = types.ModuleType("deepeval.models")
+    fake_deepeval_base_model = types.SimpleNamespace(DeepEvalBaseLLM=FakeDeepEvalBaseLLM)
+
+    with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
+        sys.modules,
+        {
+            "deepeval": fake_deepeval_module,
+            "deepeval.models": fake_deepeval_models_module,
+            "langchain_google_genai": fake_google_module,
+            "deepeval.models.base_model": fake_deepeval_base_model,
+        },
+    ):
+        model = ai_utils.get_deepeval_model()
+
+    assert isinstance(model, FakeDeepEvalBaseLLM)
+    assert model.get_model_name() == ai_utils.GEMINI_MODEL
+    assert model.generate("hello") == "sync:hello"
+    assert model.generate("hello", schema="MySchema") == {
+        "schema": "MySchema",
+        "prompt": "hello",
+    }
+
+    import asyncio
+
+    assert asyncio.run(model.a_generate("hello")) == "async:hello"
+    assert asyncio.run(model.a_generate("hello", schema="MySchema")) == {
+        "schema": "MySchema",
+        "prompt": "hello",
+    }
+
+
+def test_run_deepeval_evaluation_disables_confident_requests():
+    """Confident AI reporting is off during evaluate() and restored afterwards."""
+    fake_is_confident = mock.Mock(return_value=True)
+    fake_test_run_module = types.SimpleNamespace(is_confident=fake_is_confident)
+    confident_during_evaluate = []
+
+    def record_confident_state(**kwargs):
+        # Capture what DeepEval would see while the evaluation is running.
+        confident_during_evaluate.append(fake_test_run_module.is_confident())
+        return "evaluation-result"
+
+    fake_evaluate = mock.Mock(side_effect=record_confident_state)
+    fake_deepeval_module = types.ModuleType("deepeval")
+    fake_deepeval_module.evaluate = fake_evaluate
+
+    with mock.patch.dict(
+        sys.modules,
+        {
+            "deepeval": fake_deepeval_module,
+            # Must match the module path imported by run_deepeval_evaluation.
+            "deepeval.test_run.test_run": fake_test_run_module,
+        },
+    ):
+        result = ai_utils.run_deepeval_evaluation(
+            test_cases=["test-case"], metrics=["metric"]
+        )
+
+    fake_evaluate.assert_called_once_with(
+        test_cases=["test-case"], metrics=["metric"]
+    )
+    assert confident_during_evaluate == [False]
+    assert fake_test_run_module.is_confident is fake_is_confident
+    assert result == "evaluation-result"
diff --git a/uv.lock b/uv.lock
index b0efd17eb..4f0ada24f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -11361,7 +11361,7 @@ wheels = [
 
 [[package]]
 name = "validmind"
-version = "2.13.4"
+version = "2.13.5"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp", extra = ["speedups"] },
diff --git a/validmind/__version__.py b/validmind/__version__.py
index 8f57b15e9..d00ccbef0 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.13.4"
+__version__ = "2.13.5"
diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py
index e717ebe59..345ef2d90 100644
--- a/validmind/ai/utils.py
+++ b/validmind/ai/utils.py
@@ -20,7 +20,7 @@
 OPENAI_MODEL = "gpt-4.1"
 OPENAI_EMBEDDINGS_MODEL = "text-embedding-3-small"
 GEMINI_MODEL = "gemini-2.5-pro"
-GEMINI_EMBEDDINGS_MODEL = "models/text-embedding-004"
+GEMINI_EMBEDDINGS_MODEL = "gemini-embedding-001"
 
 # can be None, True or False (ternary to represent initial state, ack and failed ack)
 __ack = None
@@ -59,10 +59,7 @@ def _get_configured_provider():
     if os.getenv("AZURE_OPENAI_KEY"):
         return "azure"
 
-    if _get_google_api_key():
-        return "gemini"
-
-    return None
+    return "gemini"
 
 
 def get_client_and_model():
@@ -104,18 +101,12 @@ def get_client_and_model():
 
         logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")
 
-    elif provider == "gemini":
+    else:
         __client = None
         __model = os.getenv("GEMINI_MODEL", GEMINI_MODEL)
 
         logger.debug(f"Using Gemini {__model} for generating descriptions")
 
-    else:
-        raise ValueError(
-            "OPENAI_API_KEY, AZURE_OPENAI_KEY, GOOGLE_API_KEY, or GEMINI_API_KEY "
-            "must be setup to use LLM features"
-        )
-
     return __client, __model
 
 
@@ -202,16 +193,18 @@ def _build_gemini_judge_config(model):
         langchain_google_genai, "GoogleGenerativeAIEmbeddings"
     )
     google_api_key =
_get_google_api_key()
+    chat_kwargs = {"model": model}
+    embeddings_kwargs = {
+        "model": os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
+    }
+
+    if google_api_key:
+        chat_kwargs["api_key"] = google_api_key
+        embeddings_kwargs["google_api_key"] = google_api_key
 
     return (
-        ChatGoogleGenerativeAI(
-            model=model,
-            api_key=google_api_key,
-        ),
-        GoogleGenerativeAIEmbeddings(
-            model=os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
-            google_api_key=google_api_key,
-        ),
+        ChatGoogleGenerativeAI(**chat_kwargs),
+        GoogleGenerativeAIEmbeddings(**embeddings_kwargs),
     )
 
 
@@ -230,6 +223,112 @@ def _build_openai_judge_config(client, model):
     )
 
 
+def _import_deepeval_base_llm():
+    """Return DeepEval's ``DeepEvalBaseLLM`` class, imported lazily.
+
+    Raises:
+        ImportError: If ``deepeval.models.base_model`` cannot be imported.
+    """
+    try:
+        deepeval_base_model = importlib.import_module("deepeval.models.base_model")
+    except ImportError as err:
+        # Chain the original error so the real import failure stays visible.
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
+        ) from err
+
+    return getattr(deepeval_base_model, "DeepEvalBaseLLM")
+
+
+def _unwrap_deepeval_response(response):
+    """Return ``response.content`` when present, else ``response`` unchanged."""
+    return getattr(response, "content", response)
+
+
+def _build_gemini_deepeval_model(model):
+    """Wrap the Gemini judge chat model in a ``DeepEvalBaseLLM`` subclass.
+
+    Args:
+        model: Gemini model name, also reported by ``get_model_name()``.
+
+    Returns:
+        An instance of a ``DeepEvalBaseLLM`` subclass that delegates
+        generation to the chat model from ``_build_gemini_judge_config``.
+    """
+    DeepEvalBaseLLM = _import_deepeval_base_llm()
+    judge_llm, _ = _build_gemini_judge_config(model)
+
+    class GeminiDeepEvalModel(DeepEvalBaseLLM):
+        def __init__(self, chat_model, model_name):
+            self._chat_model = chat_model
+            self._model_name = model_name
+            self.model = self.load_model()
+
+        def load_model(self, *args, **kwargs):
+            return self._chat_model
+
+        def generate(self, prompt: str, schema=None):
+            chat_model = self.load_model()
+            if schema is not None and hasattr(chat_model, "with_structured_output"):
+                # Structured results are already schema-shaped; unwrapping them
+                # would misread a schema field that happens to be named "content".
+                return chat_model.with_structured_output(schema).invoke(prompt)
+
+            return _unwrap_deepeval_response(chat_model.invoke(prompt))
+
+        async def a_generate(self, prompt: str, schema=None):
+            chat_model = self.load_model()
+            if schema is not None and hasattr(chat_model, "with_structured_output"):
+                # Same rule as generate(): never unwrap structured output.
+                return await chat_model.with_structured_output(schema).ainvoke(
+                    prompt
+                )
+
+            return _unwrap_deepeval_response(await chat_model.ainvoke(prompt))
+
+        def get_model_name(self, *args, **kwargs):
+            return self._model_name
+
+    return GeminiDeepEvalModel(judge_llm, model)
+
+
+def run_deepeval_evaluation(*, test_cases, metrics):
+    """Run ``deepeval.evaluate`` with Confident AI reporting switched off.
+
+    ``deepeval.test_run.test_run.is_confident`` is replaced by a stub that
+    returns ``False`` for the duration of the call and is always restored.
+
+    Args:
+        test_cases: Passed through to ``deepeval.evaluate``.
+        metrics: Passed through to ``deepeval.evaluate``.
+
+    Returns:
+        Whatever ``deepeval.evaluate`` returns.
+
+    Raises:
+        ImportError: If DeepEval cannot be imported.
+    """
+    try:
+        from deepeval import evaluate
+
+        deepeval_test_run = importlib.import_module("deepeval.test_run.test_run")
+    except ImportError as err:
+        # Every provider's scorers call this helper, so the hint is not
+        # Gemini-specific; chain the original error for debugging.
+        raise ImportError(
+            "Please run `pip install validmind[llm]` to use DeepEval scorers"
+        ) from err
+
+    original_is_confident = deepeval_test_run.is_confident
+
+    try:
+        # ValidMind scorers should run locally without depending on Confident AI login state.
+        deepeval_test_run.is_confident = lambda: False
+        return evaluate(test_cases=test_cases, metrics=metrics)
+    finally:
+        deepeval_test_run.is_confident = original_is_confident
+
+
 def get_judge_config(judge_llm=None, judge_embeddings=None):
     Embeddings, BaseChatModel, FunctionModel = _import_judge_dependencies()
 
@@ -270,6 +369,11 @@ def get_deepeval_model():
     _, model = get_client_and_model()
 
     if provider == "gemini":
+        google_api_key = _get_google_api_key()
+        # Truthiness check, matching _build_gemini_judge_config: an empty key is no key.
+        if not google_api_key:
+            return _build_gemini_deepeval_model(model)
+
         try:
             deepeval_models = importlib.import_module("deepeval.models")
         except ImportError:
@@ -280,7 +384,7 @@ def get_deepeval_model():
         GeminiModel = getattr(deepeval_models, "GeminiModel")
         return GeminiModel(
             model=model,
-            api_key=_get_google_api_key(),
+            api_key=google_api_key,
             temperature=0,
         )
 
diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py
index a3f05ac47..c35c753cd 100644
--- a/validmind/datasets/llm/agent_dataset.py
+++ b/validmind/datasets/llm/agent_dataset.py
@@ -413,14 +413,16 @@ def to_deepeval_test_cases(self) -> List[Any]:
 
                 test_case = LLMTestCase(
                     input=str(row["input"]),
-                    actual_output=str(row["actual_output"])
-                    if pd.notna(row["actual_output"])
-                    else "",
+ actual_output=( + str(row["actual_output"]) + if pd.notna(row["actual_output"]) + else "" + ), expected_output=expected_output_val, context=context_val if context_val else None, - retrieval_context=retrieval_context_val - if retrieval_context_val - else None, + retrieval_context=( + retrieval_context_val if retrieval_context_val else None + ), # Note: tools_called deserialization would need more complex logic # for now we'll keep it simple ) diff --git a/validmind/scorers/llm/deepeval/AnswerRelevancy.py b/validmind/scorers/llm/deepeval/AnswerRelevancy.py index 4a6de7e64..0f3689bd9 100644 --- a/validmind/scorers/llm/deepeval/AnswerRelevancy.py +++ b/validmind/scorers/llm/deepeval/AnswerRelevancy.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import AnswerRelevancyMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -83,7 +82,7 @@ def AnswerRelevancy( input=input, actual_output=actual_output, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) # Extract score and reason from the metric result metric_data = result.test_results[0].metrics_data[0] diff --git a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py index c453bee91..acbb7dadf 100644 --- a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py +++ b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import 
get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import ArgumentCorrectnessMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -103,7 +102,7 @@ def ArgumentCorrectness( tools_called=actual_tools_list, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/Bias.py b/validmind/scorers/llm/deepeval/Bias.py index 533d4d862..6245d7c08 100644 --- a/validmind/scorers/llm/deepeval/Bias.py +++ b/validmind/scorers/llm/deepeval/Bias.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import BiasMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -91,7 +90,7 @@ def Bias( actual_output=actual_output_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) # Extract score and reason from the metric result metric_data = result.test_results[0].metrics_data[0] diff --git a/validmind/scorers/llm/deepeval/ContextualPrecision.py b/validmind/scorers/llm/deepeval/ContextualPrecision.py index 304816a4c..7e44aa0ce 100644 --- a/validmind/scorers/llm/deepeval/ContextualPrecision.py +++ 
b/validmind/scorers/llm/deepeval/ContextualPrecision.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import ContextualPrecisionMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -101,7 +100,7 @@ def ContextualPrecision( retrieval_context=retrieval_context_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/ContextualRecall.py b/validmind/scorers/llm/deepeval/ContextualRecall.py index ef88a3da5..80e5c2076 100644 --- a/validmind/scorers/llm/deepeval/ContextualRecall.py +++ b/validmind/scorers/llm/deepeval/ContextualRecall.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import ContextualRecallMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -101,7 +100,7 @@ def ContextualRecall( retrieval_context=retrieval_context_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = 
result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/ContextualRelevancy.py b/validmind/scorers/llm/deepeval/ContextualRelevancy.py index d1c9e91f8..9f9155ff7 100644 --- a/validmind/scorers/llm/deepeval/ContextualRelevancy.py +++ b/validmind/scorers/llm/deepeval/ContextualRelevancy.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import ContextualRelevancyMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -101,7 +100,7 @@ def ContextualRelevancy( retrieval_context=retrieval_context_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/Faithfulness.py b/validmind/scorers/llm/deepeval/Faithfulness.py index 555455d4c..0c3a16a4f 100644 --- a/validmind/scorers/llm/deepeval/Faithfulness.py +++ b/validmind/scorers/llm/deepeval/Faithfulness.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import FaithfulnessMetric from 
deepeval.test_case import LLMTestCase except ImportError as e: @@ -101,7 +100,7 @@ def Faithfulness( retrieval_context=retrieval_context_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/Hallucination.py b/validmind/scorers/llm/deepeval/Hallucination.py index a7c9a824e..7c92f21f7 100644 --- a/validmind/scorers/llm/deepeval/Hallucination.py +++ b/validmind/scorers/llm/deepeval/Hallucination.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import HallucinationMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -101,7 +100,7 @@ def Hallucination( context=context_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/Summarization.py b/validmind/scorers/llm/deepeval/Summarization.py index 56b9ba918..738ed677e 100644 --- a/validmind/scorers/llm/deepeval/Summarization.py +++ b/validmind/scorers/llm/deepeval/Summarization.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List, Optional from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, 
run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import SummarizationMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -102,7 +101,7 @@ def Summarization( actual_output=actual_output_value, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/ToolCorrectness.py b/validmind/scorers/llm/deepeval/ToolCorrectness.py index 5f7a085e8..67ca61f7f 100644 --- a/validmind/scorers/llm/deepeval/ToolCorrectness.py +++ b/validmind/scorers/llm/deepeval/ToolCorrectness.py @@ -5,13 +5,12 @@ from typing import Any, Dict, List from validmind import tags, tasks -from validmind.ai.utils import get_deepeval_model +from validmind.ai.utils import get_deepeval_model, run_deepeval_evaluation from validmind.errors import MissingDependencyError from validmind.tests.decorator import scorer from validmind.vm_models.dataset import VMDataset try: - from deepeval import evaluate from deepeval.metrics import ToolCorrectnessMetric from deepeval.test_case import LLMTestCase except ImportError as e: @@ -108,7 +107,7 @@ def ToolCorrectness( tools_called=actual_tools_list, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) + result = run_deepeval_evaluation(test_cases=[test_case], metrics=[metric]) metric_data = result.test_results[0].metrics_data[0] score = metric_data.score reason = getattr(metric_data, "reason", "No reason provided") diff --git a/validmind/scorers/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py index 4a1de3536..3cf84a2d7 100644 --- a/validmind/scorers/llm/deepeval/__init__.py 
+++ b/validmind/scorers/llm/deepeval/__init__.py @@ -104,7 +104,7 @@ def _extract_tool_calls_from_message( def extract_tool_calls_from_agent_output( - agent_output: Dict[str, Any] + agent_output: Dict[str, Any], ) -> List[ToolCall]: """Extract ToolCall objects from an agent's output. diff --git a/validmind/tests/data_validation/MissingValues.py b/validmind/tests/data_validation/MissingValues.py index 63b924f88..78532f106 100644 --- a/validmind/tests/data_validation/MissingValues.py +++ b/validmind/tests/data_validation/MissingValues.py @@ -62,9 +62,9 @@ def MissingValues( "Column": col, "Number of Missing Values": missing[col], "Percentage of Missing Values (%)": missing_pct[col], - "Pass/Fail": "Pass" - if missing_pct[col] <= min_percentage_threshold - else "Fail", + "Pass/Fail": ( + "Pass" if missing_pct[col] <= min_percentage_threshold else "Fail" + ), } for col in missing.index ], diff --git a/validmind/tests/model_validation/ragas/AnswerCorrectness.py b/validmind/tests/model_validation/ragas/AnswerCorrectness.py index cfd145920..2081bce50 100644 --- a/validmind/tests/model_validation/ragas/AnswerCorrectness.py +++ b/validmind/tests/model_validation/ragas/AnswerCorrectness.py @@ -125,7 +125,7 @@ def AnswerCorrectness( result_df = evaluate( Dataset.from_pandas(df), metrics=[answer_correctness()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "answer_correctness" diff --git a/validmind/tests/model_validation/ragas/AspectCritic.py b/validmind/tests/model_validation/ragas/AspectCritic.py index 176da2eec..f52446101 100644 --- a/validmind/tests/model_validation/ragas/AspectCritic.py +++ b/validmind/tests/model_validation/ragas/AspectCritic.py @@ -165,7 +165,7 @@ def AspectCritic( result_df = evaluate( Dataset.from_pandas(df), metrics=all_aspects, - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() # reverse the score for 
aspects where lower is better diff --git a/validmind/tests/model_validation/ragas/ContextEntityRecall.py b/validmind/tests/model_validation/ragas/ContextEntityRecall.py index 756954c48..a95bed63f 100644 --- a/validmind/tests/model_validation/ragas/ContextEntityRecall.py +++ b/validmind/tests/model_validation/ragas/ContextEntityRecall.py @@ -120,7 +120,7 @@ def ContextEntityRecall( result_df = evaluate( Dataset.from_pandas(df), metrics=[context_entity_recall()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "context_entity_recall" diff --git a/validmind/tests/model_validation/ragas/ContextPrecision.py b/validmind/tests/model_validation/ragas/ContextPrecision.py index 06a355182..e6890916b 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecision.py +++ b/validmind/tests/model_validation/ragas/ContextPrecision.py @@ -116,7 +116,7 @@ def ContextPrecision( result_df = evaluate( Dataset.from_pandas(df), metrics=[context_precision()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "llm_context_precision_with_reference" diff --git a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py index 04154d1f2..3b9d91715 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +++ b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py @@ -111,7 +111,7 @@ def ContextPrecisionWithoutReference( result_df = evaluate( Dataset.from_pandas(df), metrics=[context_precision()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "llm_context_precision_without_reference" diff --git a/validmind/tests/model_validation/ragas/ContextRecall.py 
b/validmind/tests/model_validation/ragas/ContextRecall.py index 17272e372..511fd7b89 100644 --- a/validmind/tests/model_validation/ragas/ContextRecall.py +++ b/validmind/tests/model_validation/ragas/ContextRecall.py @@ -117,7 +117,7 @@ def ContextRecall( result_df = evaluate( Dataset.from_pandas(df), metrics=[context_recall()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "context_recall" diff --git a/validmind/tests/model_validation/ragas/Faithfulness.py b/validmind/tests/model_validation/ragas/Faithfulness.py index 0cdc8d1c1..6c25068b9 100644 --- a/validmind/tests/model_validation/ragas/Faithfulness.py +++ b/validmind/tests/model_validation/ragas/Faithfulness.py @@ -122,7 +122,7 @@ def Faithfulness( result_df = evaluate( Dataset.from_pandas(df), metrics=[faithfulness()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "faithfulness" diff --git a/validmind/tests/model_validation/ragas/SemanticSimilarity.py b/validmind/tests/model_validation/ragas/SemanticSimilarity.py index aa5c953ef..309331228 100644 --- a/validmind/tests/model_validation/ragas/SemanticSimilarity.py +++ b/validmind/tests/model_validation/ragas/SemanticSimilarity.py @@ -114,7 +114,7 @@ def SemanticSimilarity( result_df = evaluate( Dataset.from_pandas(df), metrics=[semantic_similarity()], - **get_ragas_config(judge_llm, judge_embeddings) + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "semantic_similarity" diff --git a/validmind/tests/model_validation/sklearn/FeatureImportance.py b/validmind/tests/model_validation/sklearn/FeatureImportance.py index 91cb32618..8f4769161 100644 --- a/validmind/tests/model_validation/sklearn/FeatureImportance.py +++ b/validmind/tests/model_validation/sklearn/FeatureImportance.py @@ -76,9 +76,9 @@ def FeatureImportance( for i in range(num_features): if i < 
len(top_features): - result[ - f"Feature {i + 1}" - ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]" + result[f"Feature {i + 1}"] = ( + f"[{top_features[i][0]}; {top_features[i][1]:.4f}]" + ) else: result[f"Feature {i + 1}"] = None diff --git a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py index 502bcc270..1ecd8aa83 100644 --- a/validmind/tests/model_validation/sklearn/HyperParametersTuning.py +++ b/validmind/tests/model_validation/sklearn/HyperParametersTuning.py @@ -23,9 +23,7 @@ def _get_metrics(scoring): return ( scoring if isinstance(scoring, list) - else list(scoring.keys()) - if isinstance(scoring, dict) - else [scoring] + else list(scoring.keys()) if isinstance(scoring, dict) else [scoring] ) diff --git a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py index bbf6d5f73..b7639583d 100644 --- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py @@ -325,9 +325,9 @@ def RobustnessDiagnosis( # rename perturbation size for baseline # Convert to object type first to avoid dtype incompatibility warning results_df["Perturbation Size"] = results_df["Perturbation Size"].astype(object) - results_df.loc[ - results_df["Perturbation Size"] == 0.0, "Perturbation Size" - ] = "Baseline (0.0)" + results_df.loc[results_df["Perturbation Size"] == 0.0, "Perturbation Size"] = ( + "Baseline (0.0)" + ) return ( results_df, diff --git a/validmind/tests/prompt_validation/ai_powered_test.py b/validmind/tests/prompt_validation/ai_powered_test.py index b3ad420be..3e67ade96 100644 --- a/validmind/tests/prompt_validation/ai_powered_test.py +++ b/validmind/tests/prompt_validation/ai_powered_test.py @@ -34,7 +34,8 @@ def call_model( raise ValueError( "LLM is not configured. 
Please set an `OPENAI_API_KEY`, " "`AZURE_OPENAI_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` " - "environment variable or ensure that you are connected to the " + "environment variable, ensure Gemini can be initialized in your " + "environment, or ensure that you are connected to the " "ValidMind API and ValidMind AI is enabled for your account." )