Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
},
{
"cell_type": "markdown",
"id": "322b05cc",
"metadata": {},
"source": [
"<a id='toc3__'></a>\n",
Expand Down Expand Up @@ -266,11 +267,11 @@
"import validmind as vm\n",
"\n",
"vm.init(\n",
" api_host=\"http://localhost:5000/api/v1/tracking\",\n",
" api_key=\"..\",\n",
" api_secret=\"..\",\n",
" api_host=\"https://app.prod.validmind.ai/api/v1/tracking\",\n",
" api_key=\"...\",\n",
" api_secret=\"...\",\n",
" document=\"documentation\", # requires library >=2.12.0\n",
" model=\"..\",\n",
" model=\"...\",\n",
")"
]
},
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "validmind"
version = "2.13.4"
version = "2.13.5"
description = "ValidMind Library"
readme = "README.pypi.md"
requires-python = ">=3.9,<3.15"
Expand Down
2 changes: 1 addition & 1 deletion r/validmind/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: validmind
Type: Package
Title: Interface to the 'ValidMind' Platform
Version: 2.13.4
Version: 2.13.5
Authors@R: c(person("Andres", "Rodriguez", role = c("aut", "cre","cph"),
email = "andres@validmind.ai"))
Maintainer: Andres Rodriguez <andres@validmind.ai>
Expand Down
130 changes: 130 additions & 0 deletions tests/unit_tests/test_ai_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@ def test_get_client_and_model_supports_gemini_env():
assert model == ai_utils.GEMINI_MODEL


def test_get_client_and_model_defaults_to_gemini_without_provider_env():
    """With an empty environment, the resolved provider is keyless Gemini."""
    _reset_ai_utils_state()

    # Wipe every environment variable so no provider key can be detected.
    empty_env = mock.patch.dict(os.environ, {}, clear=True)
    with empty_env:
        resolved = ai_utils.get_client_and_model()

    client, model = resolved
    assert client is None
    assert model == ai_utils.GEMINI_MODEL


def test_get_judge_config_builds_gemini_models():
_reset_ai_utils_state()

Expand Down Expand Up @@ -59,6 +69,35 @@ def __init__(self, **kwargs):
}


def test_get_judge_config_builds_gemini_models_without_provider_env():
    """Judge LLM and embeddings are built with only a ``model`` kwarg when no key is set."""
    _reset_ai_utils_state()

    class _KwargsRecorder:
        # Stores the constructor kwargs so the test can inspect them afterwards.
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeChatGoogleGenerativeAI(_KwargsRecorder):
        pass

    class FakeGoogleGenerativeAIEmbeddings(_KwargsRecorder):
        pass

    # Stand-in for the real ``langchain_google_genai`` package.
    stub_google_genai = types.SimpleNamespace(
        ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
        GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
    )

    empty_env = mock.patch.dict(os.environ, {}, clear=True)
    stubbed_modules = mock.patch.dict(
        sys.modules, {"langchain_google_genai": stub_google_genai}
    )
    with empty_env, stubbed_modules:
        llm, embeddings = ai_utils.get_judge_config()

    expected_llm_kwargs = {"model": ai_utils.GEMINI_MODEL}
    expected_embeddings_kwargs = {"model": ai_utils.GEMINI_EMBEDDINGS_MODEL}

    assert isinstance(llm, FakeChatGoogleGenerativeAI)
    assert llm.kwargs == expected_llm_kwargs
    assert isinstance(embeddings, FakeGoogleGenerativeAIEmbeddings)
    assert embeddings.kwargs == expected_embeddings_kwargs


def test_is_configured_uses_resolved_judge_model():
_reset_ai_utils_state()

Expand Down Expand Up @@ -97,3 +136,94 @@ def __init__(self, **kwargs):
"api_key": "test-key",
"temperature": 0,
}


def test_get_deepeval_model_supports_keyless_gemini_without_provider_env():
    """With an empty environment, get_deepeval_model returns a usable Gemini adapter.

    ``deepeval`` and ``langchain_google_genai`` are replaced by in-memory fakes
    registered in ``sys.modules``; the sync, async, and structured-output paths
    of the returned model are then exercised.
    """
    import asyncio

    _reset_ai_utils_state()

    class FakeDeepEvalBaseLLM:
        pass

    class FakeChatGoogleGenerativeAI:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def invoke(self, prompt):
            return types.SimpleNamespace(content=f"sync:{prompt}")

        async def ainvoke(self, prompt):
            return types.SimpleNamespace(content=f"async:{prompt}")

        def with_structured_output(self, schema):
            # Echo schema and prompt back so the test can see both were forwarded.
            class StructuredModel:
                def invoke(self, prompt):
                    return {"schema": schema, "prompt": prompt}

                async def ainvoke(self, prompt):
                    return {"schema": schema, "prompt": prompt}

            return StructuredModel()

    class FakeGoogleGenerativeAIEmbeddings:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    stubbed_modules = {
        "deepeval": types.ModuleType("deepeval"),
        "deepeval.models": types.ModuleType("deepeval.models"),
        "deepeval.models.base_model": types.SimpleNamespace(
            DeepEvalBaseLLM=FakeDeepEvalBaseLLM
        ),
        "langchain_google_genai": types.SimpleNamespace(
            ChatGoogleGenerativeAI=FakeChatGoogleGenerativeAI,
            GoogleGenerativeAIEmbeddings=FakeGoogleGenerativeAIEmbeddings,
        ),
    }

    with mock.patch.dict(os.environ, {}, clear=True), mock.patch.dict(
        sys.modules, stubbed_modules
    ):
        model = ai_utils.get_deepeval_model()

    expected_structured = {"schema": "MySchema", "prompt": "hello"}

    assert isinstance(model, FakeDeepEvalBaseLLM)
    assert model.get_model_name() == ai_utils.GEMINI_MODEL

    # Synchronous generation: plain text and structured output.
    assert model.generate("hello") == "sync:hello"
    assert model.generate("hello", schema="MySchema") == expected_structured

    # Asynchronous generation: plain text and structured output.
    assert asyncio.run(model.a_generate("hello")) == "async:hello"
    assert (
        asyncio.run(model.a_generate("hello", schema="MySchema"))
        == expected_structured
    )


def test_run_deepeval_evaluation_disables_confident_requests():
    """run_deepeval_evaluation forces ``is_confident()`` to False only while evaluating.

    The stub is registered under ``deepeval.test_run.test_run`` because that is
    the module path ``run_deepeval_evaluation`` imports and patches. Registering
    only ``deepeval.test_run`` would leave the stub untouched by the code under
    test, so the restore assertion would pass without checking anything.
    """
    fake_is_confident = mock.Mock(return_value=True)
    fake_test_run_module = types.SimpleNamespace(is_confident=fake_is_confident)

    # Record what is_confident() reports while evaluate() is running.
    observed_during_evaluate = []

    def _record_and_return(**_kwargs):
        observed_during_evaluate.append(fake_test_run_module.is_confident())
        return "evaluation-result"

    fake_evaluate = mock.Mock(side_effect=_record_and_return)
    fake_deepeval_module = types.ModuleType("deepeval")
    fake_deepeval_module.evaluate = fake_evaluate

    with mock.patch.dict(
        sys.modules,
        {
            "deepeval": fake_deepeval_module,
            "deepeval.test_run": fake_test_run_module,
            "deepeval.test_run.test_run": fake_test_run_module,
        },
    ):
        result = ai_utils.run_deepeval_evaluation(
            test_cases=["test-case"], metrics=["metric"]
        )

    fake_evaluate.assert_called_once_with(
        test_cases=["test-case"], metrics=["metric"]
    )
    # Disabled while evaluating ...
    assert observed_during_evaluate == [False]
    # ... and restored to the original callable afterwards.
    assert fake_test_run_module.is_confident is fake_is_confident
    assert result == "evaluation-result"
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion validmind/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.13.4"
__version__ = "2.13.5"
113 changes: 92 additions & 21 deletions validmind/ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
OPENAI_MODEL = "gpt-4.1"
OPENAI_EMBEDDINGS_MODEL = "text-embedding-3-small"
GEMINI_MODEL = "gemini-2.5-pro"
GEMINI_EMBEDDINGS_MODEL = "models/text-embedding-004"
GEMINI_EMBEDDINGS_MODEL = "gemini-embedding-001"

# can be None, True or False (ternary to represent initial state, ack and failed ack)
__ack = None
Expand Down Expand Up @@ -59,10 +59,7 @@ def _get_configured_provider():
if os.getenv("AZURE_OPENAI_KEY"):
return "azure"

if _get_google_api_key():
return "gemini"

return None
return "gemini"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a thought: what if, instead of trying to auto-configure credentials and config for the LLM, we just accept a LangChain client object? That way the user has full flexibility to use whatever provider and credentials they want.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@juanmleng this is similar to what I was referring to today re: "bring your own client/judge" :)

I'd say this change is good for now (if a new version is needed soon) but we should figure out a more flexible interface so we don't have to change internal implementation of the code whenever the underlying LLM/client interface changes.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great point @johnwalz97, totally agree. Worth noting that DeepEval scorers can’t use a raw LangChain client directly, so we would need an adapter around it. So perhaps we can leave this as is for now, and in the next iteration give it a bit of thought on how to expose a cleaner client-based API as @cachafla suggested?



def get_client_and_model():
Expand Down Expand Up @@ -104,18 +101,12 @@ def get_client_and_model():

logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")

elif provider == "gemini":
else:
__client = None
__model = os.getenv("GEMINI_MODEL", GEMINI_MODEL)

logger.debug(f"Using Gemini {__model} for generating descriptions")

else:
raise ValueError(
"OPENAI_API_KEY, AZURE_OPENAI_KEY, GOOGLE_API_KEY, or GEMINI_API_KEY "
"must be setup to use LLM features"
)

return __client, __model


Expand Down Expand Up @@ -202,16 +193,18 @@ def _build_gemini_judge_config(model):
langchain_google_genai, "GoogleGenerativeAIEmbeddings"
)
google_api_key = _get_google_api_key()
chat_kwargs = {"model": model}
embeddings_kwargs = {
"model": os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
}

if google_api_key:
chat_kwargs["api_key"] = google_api_key
embeddings_kwargs["google_api_key"] = google_api_key

return (
ChatGoogleGenerativeAI(
model=model,
api_key=google_api_key,
),
GoogleGenerativeAIEmbeddings(
model=os.getenv("GEMINI_EMBEDDINGS_MODEL", GEMINI_EMBEDDINGS_MODEL),
google_api_key=google_api_key,
),
ChatGoogleGenerativeAI(**chat_kwargs),
GoogleGenerativeAIEmbeddings(**embeddings_kwargs),
)


Expand All @@ -230,6 +223,80 @@ def _build_openai_judge_config(client, model):
)


def _import_deepeval_base_llm():
    """Lazily import and return DeepEval's ``DeepEvalBaseLLM`` class.

    The import is deferred to call time so that ``deepeval`` is only needed
    when this helper is actually used.

    Returns:
        The ``DeepEvalBaseLLM`` attribute of ``deepeval.models.base_model``.

    Raises:
        ImportError: If ``deepeval.models.base_model`` cannot be imported. The
            original import failure is attached as ``__cause__``.
    """
    try:
        deepeval_base_model = importlib.import_module("deepeval.models.base_model")
    except ImportError as err:
        # Chain explicitly so the underlying failure (missing package, broken
        # transitive import, ...) stays visible in the traceback.
        raise ImportError(
            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
        ) from err

    return getattr(deepeval_base_model, "DeepEvalBaseLLM")


def _unwrap_deepeval_response(response):
    """Return ``response.content`` if that attribute exists, otherwise ``response`` itself."""
    try:
        return response.content
    except AttributeError:
        # No ``content`` attribute: hand the object back unchanged.
        return response


def _build_gemini_deepeval_model(model):
    """Build a ``DeepEvalBaseLLM`` adapter around the Gemini judge chat model.

    Args:
        model: Model name passed to ``_build_gemini_judge_config`` and
            reported by the adapter's ``get_model_name``.

    Returns:
        An instance of a locally defined ``DeepEvalBaseLLM`` subclass whose
        ``generate`` / ``a_generate`` delegate to the chat model's
        ``invoke`` / ``ainvoke``.
    """
    base_cls = _import_deepeval_base_llm()
    chat_llm, _unused_embeddings = _build_gemini_judge_config(model)

    def _pick_runnable(llm, schema):
        # Use the structured-output wrapper only when a schema was requested
        # and the chat model supports it; otherwise call the model directly.
        if schema is None or not hasattr(llm, "with_structured_output"):
            return llm
        return llm.with_structured_output(schema)

    class GeminiDeepEvalModel(base_cls):
        def __init__(self, chat_model, model_name):
            self._chat_model = chat_model
            self._model_name = model_name
            self.model = self.load_model()

        def load_model(self, *args, **kwargs):
            return self._chat_model

        def generate(self, prompt: str, schema=None):
            runnable = _pick_runnable(self.load_model(), schema)
            return _unwrap_deepeval_response(runnable.invoke(prompt))

        async def a_generate(self, prompt: str, schema=None):
            runnable = _pick_runnable(self.load_model(), schema)
            reply = await runnable.ainvoke(prompt)
            return _unwrap_deepeval_response(reply)

        def get_model_name(self, *args, **kwargs):
            return self._model_name

    return GeminiDeepEvalModel(chat_llm, model)


def run_deepeval_evaluation(*, test_cases, metrics):
    """Run ``deepeval.evaluate`` with Confident AI detection temporarily disabled.

    ``deepeval.test_run.test_run.is_confident`` is replaced by a callable that
    returns ``False`` for the duration of the evaluation and is restored
    afterwards, even if ``evaluate`` raises.

    Args:
        test_cases: Forwarded to ``deepeval.evaluate`` as ``test_cases``.
        metrics: Forwarded to ``deepeval.evaluate`` as ``metrics``.

    Returns:
        Whatever ``deepeval.evaluate`` returns.

    Raises:
        ImportError: If ``deepeval`` or ``deepeval.test_run.test_run`` cannot
            be imported. The original failure is attached as ``__cause__``.
    """
    try:
        from deepeval import evaluate

        deepeval_test_run = importlib.import_module("deepeval.test_run.test_run")
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            "Please run `pip install validmind[llm]` to use Gemini DeepEval scorers"
        ) from err

    original_is_confident = deepeval_test_run.is_confident

    try:
        # ValidMind scorers should run locally without depending on Confident AI login state.
        deepeval_test_run.is_confident = lambda: False
        return evaluate(test_cases=test_cases, metrics=metrics)
    finally:
        # Always restore the module attribute, including on failure.
        deepeval_test_run.is_confident = original_is_confident


def get_judge_config(judge_llm=None, judge_embeddings=None):
Embeddings, BaseChatModel, FunctionModel = _import_judge_dependencies()

Expand Down Expand Up @@ -270,6 +337,10 @@ def get_deepeval_model():
_, model = get_client_and_model()

if provider == "gemini":
google_api_key = _get_google_api_key()
if google_api_key is None:
return _build_gemini_deepeval_model(model)

try:
deepeval_models = importlib.import_module("deepeval.models")
except ImportError:
Expand All @@ -280,7 +351,7 @@ def get_deepeval_model():
GeminiModel = getattr(deepeval_models, "GeminiModel")
return GeminiModel(
model=model,
api_key=_get_google_api_key(),
api_key=google_api_key,
temperature=0,
)

Expand Down
14 changes: 8 additions & 6 deletions validmind/datasets/llm/agent_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,14 +413,16 @@ def to_deepeval_test_cases(self) -> List[Any]:

test_case = LLMTestCase(
input=str(row["input"]),
actual_output=str(row["actual_output"])
if pd.notna(row["actual_output"])
else "",
actual_output=(
str(row["actual_output"])
if pd.notna(row["actual_output"])
else ""
),
expected_output=expected_output_val,
context=context_val if context_val else None,
retrieval_context=retrieval_context_val
if retrieval_context_val
else None,
retrieval_context=(
retrieval_context_val if retrieval_context_val else None
),
# Note: tools_called deserialization would need more complex logic
# for now we'll keep it simple
)
Expand Down
Loading
Loading