From c3d70f786607f90dae69f6917ce733de3aa8b1c5 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Wed, 18 Jun 2025 04:23:02 -0300 Subject: [PATCH 1/6] add gemini model and sample asset --- .../en/QA/bd/MultiNativQA_Gemini_ZeroShot.py | 66 +++++++ llmebench/models/Gemini.py | 176 ++++++++++++++++++ llmebench/models/__init__.py | 1 + 3 files changed, 243 insertions(+) create mode 100644 assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py create mode 100644 llmebench/models/Gemini.py diff --git a/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py b/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py new file mode 100644 index 00000000..20d03eb8 --- /dev/null +++ b/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py @@ -0,0 +1,66 @@ +import json +import re + +from llmebench.datasets import MultiNativQADataset +from llmebench.models import Gemini +from llmebench.tasks import MultiNativQATask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama 3 8b", + "description": "Deployed on Azure.", + "scores": {}, + } + + +def config(): + return { + "dataset": MultiNativQADataset, + "task": MultiNativQATask, + "model": Gemini, + "general_args": {"test_split": "english_bd"}, + } + + +def prompt(input_sample): + # Define the question prompt + question_prompt = f""" + Please use your expertise to answer the following English question. Answer in English and rate your confidence level from 1 to 10. + Provide your response in the following JSON format: {{"answer": "your answer", "score": your confidence score}}. + Please provide JSON output only. No additional text. Answer should be limited to less or equal to {input_sample['length']} words. + + Question: {input_sample['question']} + """ + + # Define the assistant prompt + assistant_prompt = """ + You are an English AI assistant specialized in providing detailed and accurate answers across various fields. + Your task is to deliver clear, concise, and relevant information. + """ + + return [ + { + "role": "assistant", + "content": assistant_prompt, + }, + { + "role": "user", + "content": question_prompt, + }, + ] + + +def post_process(response): + content = response[0]["content"]["parts"][0]["text"] + content = content.replace("\n", "").strip() + if "```json" in content: + # content = content.replace("```json", "").replace('```', '').replace("\n}", "}") + # content = content.replace("{\n", "{").replace("\",\n", "\",") + + content = re.search(r"```json(.*)```", content).group(1) + return json.loads(content)["answer"] + # response = json.loads(data) + # answer = response["answer"] + return answer diff --git a/llmebench/models/Gemini.py b/llmebench/models/Gemini.py new file mode 100644 index 00000000..83699c18 --- /dev/null +++ b/llmebench/models/Gemini.py @@ -0,0 +1,176 @@ +import json +import logging +import os + +import requests + +import vertexai +import vertexai.preview.generative_models as generative_models +from vertexai.generative_models import FinishReason, GenerativeModel, Part + +from llmebench.models.model_base import ModelBase + + +class GeminiFailure(Exception): + """Exception class to map various failure types from the Gemini server""" + + def __init__(self, failure_type, failure_message): + self.type_mapping = { + "processing": "Model Inference failure", + "connection": "Failed to connect to Google Server", + } + self.type = failure_type + self.failure_message = failure_message + + def __str__(self): + return ( + f"{self.type_mapping.get(self.type, self.type)}: \n {self.failure_message}" + ) + + +class Gemini(ModelBase): + """ + Gemini Model interface. + + Arguments + --------- + project_id : str + Google Project ID. If not provided, the implementation will + look at environment variable `GOOGLE_PROJECT_ID` + api_key : str + Authentication token for the API. If not provided, the implementation will derive it + from environment variables `OPENAI_API_KEY` or `AZURE_API_KEY`. + timeout : int + Number of seconds before the request to the server is timed out + temperature : float + Temperature value to use for the model. Defaults to zero for reproducibility. + top_p : float + Top P value to use for the model. Defaults to 0.95 + max_tokens : int + Maximum number of tokens to pass to the model. Defaults to 1512 + """ + + def __init__( + self, + project_id=None, + api_key=None, + model_name=None, + timeout=20, + temperature=0, + top_p=0.95, + max_tokens=2000, + **kwargs, + ): + # API parameters + # self.api_url = api_url or os.getenv("AZURE_DEPLOYMENT_API_URL") + self.api_key = api_key or os.getenv("GOOGLE_API_KEY") + self.project_id = project_id or os.getenv("GOOGLE_PROJECT_ID") + self.model_name = model_name or os.getenv("MODEL") + if self.api_key is None: + raise Exception( + "API Key must be provided as model config or environment variable (`GOOGLE_API_KEY`)" + ) + if self.project_id is None: + raise Exception( + "PROJECT_ID must be provided as model config or environment variable (`GOOGLE_PROJECT_ID`)" + ) + self.api_timeout = timeout + self.safety_settings = { + generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH, + generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH, + generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH, + generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH, + } + # Parameters + tolerance = 1e-7 + self.temperature = temperature + if self.temperature < tolerance: + # Currently, the model inference fails if temperature + # is exactly 0, so we nudge it slightly to work around + # the issue + self.temperature += tolerance + self.top_p = top_p + self.max_tokens = max_tokens + + super(Gemini, self).__init__( + retry_exceptions=(TimeoutError, GeminiFailure), **kwargs + ) + vertexai.init(project=self.project_id, location="us-central1") + # self.client = GenerativeModel(self.model_name) + + def summarize_response(self, response): + """Returns the "outputs" key's value, if available""" + if "messages" in response: + return response["messages"] + + return response + + def prompt(self, processed_input): + """ + Gemini API Implementation + + Arguments + --------- + processed_input : list + Must be list of dictionaries, where each dictionary has two keys; + "role" defines a role in the chat (e.g. "system", "user") and + "content" defines the actual message for that turn + + Returns + ------- + response : Gemini API response + Response from the Gemini server + + Raises + ------ + GeminiFailure : Exception + This method raises this exception if the server responded with a non-ok + response + """ + # headers = { + # "Content-Type": "application/json", + # "Authorization": "Bearer " + self.api_key, + # } + # body = { + # "input_data": { + # "input_string": processed_input, + # "parameters": { + # "max_tokens": self.max_tokens, + # "temperature": self.temperature, + # "top_p": self.top_p, + # }, + # } + # } + generation_config = { + "max_output_tokens": 8192, + "temperature": 0, + "top_p": 0.95, + } + + try: + client = GenerativeModel( + self.model_name, system_instruction=[processed_input[0]["content"]] + ) + response = client.generate_content( + [processed_input[1]["content"]], + generation_config=generation_config, + safety_settings=self.safety_settings, + ) + + except Exception as e: + raise GeminiFailure( + "processing", + "Processing failed with status: {}".format(e), + ) + + # Parse the final response + try: + # response_data = response.json() + response_data = [response.to_dict() for response in response.candidates] + except Exception as e: + raise GeminiFailure( + "processing", + "Processing failed: {}".format(response), + ) + + return response_data diff --git a/llmebench/models/__init__.py b/llmebench/models/__init__.py index a97f2a79..c87c652e 100644 --- a/llmebench/models/__init__.py +++ b/llmebench/models/__init__.py @@ -1,6 +1,7 @@ from .Anthropic import AnthropicModel from .AzureModel import AzureModel from .FastChat import FastChatModel +from .Gemini import Gemini from .HuggingFaceInferenceAPI import HuggingFaceInferenceAPIModel, HuggingFaceTaskTypes from .OpenAI import LegacyOpenAIModel, OpenAIModel, OpenAIO1Model from .Petals import PetalsModel From 0a5b7ae1dd85858cdbd9e1a43a82aecbde75cd83 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Wed, 18 Jun 2025 04:28:32 -0300 Subject: [PATCH 2/6] rename model class --- assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py | 4 ++-- llmebench/models/Gemini.py | 4 ++-- llmebench/models/__init__.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py b/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py index 20d03eb8..2f9e11b7 100644 --- a/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py +++ b/assets/en/QA/bd/MultiNativQA_Gemini_ZeroShot.py @@ -2,7 +2,7 @@ import re from llmebench.datasets import MultiNativQADataset -from llmebench.models import Gemini +from llmebench.models import GeminiModel from llmebench.tasks import MultiNativQATask @@ -19,7 +19,7 @@ def config(): return { "dataset": MultiNativQADataset, "task": MultiNativQATask, - "model": Gemini, + "model": GeminiModel, "general_args": {"test_split": "english_bd"}, } diff --git a/llmebench/models/Gemini.py b/llmebench/models/Gemini.py index 83699c18..ce3cac08 100644 --- a/llmebench/models/Gemini.py +++ b/llmebench/models/Gemini.py @@ -28,7 +28,7 @@ def __str__(self): ) -class Gemini(ModelBase): +class GeminiModel(ModelBase): """ Gemini Model interface. @@ -92,7 +92,7 @@ def __init__( self.top_p = top_p self.max_tokens = max_tokens - super(Gemini, self).__init__( + super(GeminiModel, self).__init__( retry_exceptions=(TimeoutError, GeminiFailure), **kwargs ) vertexai.init(project=self.project_id, location="us-central1") diff --git a/llmebench/models/__init__.py b/llmebench/models/__init__.py index c87c652e..b02112b5 100644 --- a/llmebench/models/__init__.py +++ b/llmebench/models/__init__.py @@ -1,7 +1,7 @@ from .Anthropic import AnthropicModel from .AzureModel import AzureModel from .FastChat import FastChatModel -from .Gemini import Gemini +from .Gemini import GeminiModel from .HuggingFaceInferenceAPI import HuggingFaceInferenceAPIModel, HuggingFaceTaskTypes from .OpenAI import LegacyOpenAIModel, OpenAIModel, OpenAIO1Model from .Petals import PetalsModel From 0bb442874ea663e26db2dd5085e777a6b039bbd4 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Thu, 19 Jun 2025 01:13:18 -0300 Subject: [PATCH 3/6] add test case --- tests/models/test_Gemini.py | 95 +++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/models/test_Gemini.py diff --git a/tests/models/test_Gemini.py b/tests/models/test_Gemini.py new file mode 100644 index 00000000..7d189962 --- /dev/null +++ b/tests/models/test_Gemini.py @@ -0,0 +1,95 @@ +import unittest +from unittest.mock import patch + +from llmebench import Benchmark +from llmebench.models import GeminiModel + +from llmebench.utils import is_fewshot_asset + + +class TestAssetsForGeminiDepModelPrompts(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Load the benchmark assets + benchmark = Benchmark(benchmark_dir="assets") + all_assets = benchmark.find_assets() + + # Filter out assets not using the Petals model + cls.assets = [ + asset for asset in all_assets if asset["config"]["model"] in [GeminiModel] + ] + + def test_gemini_deployed_model_prompts(self): + "Test if all assets using this model return data in an appropriate format for prompting" + + n_shots = 3 # Sample for few shot prompts + + for asset in self.assets: + with self.subTest(msg=asset["name"]): + config = asset["config"] + dataset_args = config.get("dataset_args", {}) + dataset_args["data_dir"] = "" + dataset = config["dataset"](**dataset_args) + data_sample = dataset.get_data_sample() + if is_fewshot_asset(config, asset["module"].prompt): + prompt = asset["module"].prompt( + data_sample["input"], + [data_sample for _ in range(n_shots)], + ) + else: + prompt = asset["module"].prompt(data_sample["input"]) + + self.assertIsInstance(prompt, list) + + for message in prompt: + self.assertIsInstance(message, dict) + self.assertIn("role", message) + self.assertIsInstance(message["role"], str) + self.assertIn("content", message) + self.assertIsInstance(message["content"], (str, list)) + + +class TestGeminiDepModelConfig(unittest.TestCase): + def test_gemini_deployed_model_config(self): + "Test if model config parameters passed as arguments are used" + model = GeminiModel( + project_id="test_project_id", api_key="secret-key", model_name="gemini-test" + ) + + self.assertEqual(model.project_id, "test_project_id") + self.assertEqual(model.api_key, "secret-key") + self.assertEqual(model.model_name, "gemini-test") + + @patch.dict( + "os.environ", + { + "GOOGLE_PROJECT_ID": "test_project_id", + "GOOGLE_API_KEY": "secret-key", + "MODEL": "gemini-test", + }, + ) + def test_gemini_deployed_model_config_env_var(self): + "Test if model config parameters passed as environment variables are used" + model = GeminiModel() + + self.assertEqual(model.project_id, "test_project_id") + self.assertEqual(model.api_key, "secret-key") + self.assertEqual(model.model_name, "gemini-test") + + @patch.dict( + "os.environ", + { + "GOOGLE_PROJECT_ID": "test_project_id", + "GOOGLE_API_KEY": "secret-env-key", + "MODEL": "gemini-test", + }, + ) + def test_gemini_deployed_model_config_priority(self): + "Test if model config parameters passed directly get priority" + model = GeminiModel( + project_id="test_project_id", api_key="secret-key", model_name="gemini_test" + ) + + self.assertEqual(model.project_id, "test_project_id") + self.assertEqual(model.api_key, "secret-key") + self.assertEqual(model.model_name, "gemini_test") From 1d84102b603ba3835b0b94a316c1c6f07702eb99 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Thu, 19 Jun 2025 01:20:29 -0300 Subject: [PATCH 4/6] update requirements --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index fb0ac9f4..77c6ba63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ install_requires = rouge-score==0.1.2 absl-py==2.1.0 GitPython==3.1.43 + google-cloud-aiplatform==1.97.0 # For now, make sure NumPy 2 is not installed numpy<2 From 3a73e47f99f1eb2635cb3ea46fc22e1d9fa40ef1 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Thu, 19 Jun 2025 01:24:28 -0300 Subject: [PATCH 5/6] update requirements --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 77c6ba63..8c9a2420 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ install_requires = rouge-score==0.1.2 absl-py==2.1.0 GitPython==3.1.43 - google-cloud-aiplatform==1.97.0 + google-cloud-aiplatform>=1.90.0 # For now, make sure NumPy 2 is not installed numpy<2 From 4929c8db17864f8cc9b6566217a7a7e0ed0546b6 Mon Sep 17 00:00:00 2001 From: Arid Hasan Date: Thu, 19 Jun 2025 02:31:19 -0300 Subject: [PATCH 6/6] remove hard coded parameters --- llmebench/models/Gemini.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmebench/models/Gemini.py b/llmebench/models/Gemini.py index ce3cac08..934c675e 100644 --- a/llmebench/models/Gemini.py +++ b/llmebench/models/Gemini.py @@ -142,9 +142,9 @@ def prompt(self, processed_input): # } # } generation_config = { - "max_output_tokens": 8192, - "temperature": 0, - "top_p": 0.95, + "max_output_tokens": self.max_tokens, + "temperature": self.temperature, + "top_p": self.top_p, } try: