From 32630fa6b466b4e5cc4074269be5a7ec1f27a36f Mon Sep 17 00:00:00 2001 From: Youssef-Elawa Date: Sun, 22 Jun 2025 13:27:04 +0300 Subject: [PATCH] Adding Palm Scripts Added the Palm dataset and asset scripts, registered the dataset in llmebench/datasets/__init__.py, and modified benchmark.py and datasets/ArSAS.py to open files with explicit utf-8 encoding (needed on Windows, where the default text encoding is typically not UTF-8). Tested with the format_code.sh and run_test.sh scripts. Note: test_partial_path fails on Windows due to a path separator mismatch (/ vs \\). --- assets/ar/QA/PalmQA_Fanar.py | 47 ++++++++++++++++++++++++++ llmebench/benchmark.py | 16 ++++----- llmebench/datasets/ArSAS.py | 2 +- llmebench/datasets/PalmQA.py | 62 ++++++++++++++++++++++++++++++++++ llmebench/datasets/__init__.py | 1 + 5 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 assets/ar/QA/PalmQA_Fanar.py create mode 100644 llmebench/datasets/PalmQA.py diff --git a/assets/ar/QA/PalmQA_Fanar.py b/assets/ar/QA/PalmQA_Fanar.py new file mode 100644 index 00000000..a64dbe7b --- /dev/null +++ b/assets/ar/QA/PalmQA_Fanar.py @@ -0,0 +1,47 @@ +import json + +from llmebench.datasets import PaLMEvalDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import MultiNativQATask + + +def metadata(): + return { + "author": "UBC-NLP / Adapted by QCRI", + "model": "OpenAIModel", + "description": "Evaluation on PaLM dataset containing MSA and dialect instructions across 22 Arab countries.", + "scores": {}, + } + + +def config(): + return { + "dataset": PaLMEvalDataset, + "task": MultiNativQATask, + "model": OpenAIModel, + "general_args": {"test_split": "default"}, + } + + +def prompt(input_sample): + # Define the question prompt + question_prompt = f""" + Please use your expertise to answer the following Arabic question. Answer in Arabic. Please provide Answer only. No additional text. 
+ + Question: {input_sample['question']} + + """ + + # Define the assistant prompt + assistant_prompt = """ + You are an Arabic AI assistant specialized in providing detailed and accurate answers across various fields. Your task is to deliver clear, concise, and relevant information. + """ + return [ + {"role": "user", "content": question_prompt}, + {"role": "assistant", "content": assistant_prompt}, + ] + + +def post_process(response): + content = response["choices"][0]["message"]["content"].strip() + return content diff --git a/llmebench/benchmark.py b/llmebench/benchmark.py index 56ff059b..5b6d2199 100644 --- a/llmebench/benchmark.py +++ b/llmebench/benchmark.py @@ -201,10 +201,10 @@ def run_benchmark(self, dry_run=False): predictions = [] num_processed = 0 - full_summary_fp = open(full_summary_path, "w") + full_summary_fp = open(full_summary_path, "w", encoding="utf-8") num_failed = 0 - failed_summary_fp = open(failed_summary_path, "w") + failed_summary_fp = open(failed_summary_path, "w", encoding="utf-8") for sample_idx, (input_sample, few_shot_examples) in enumerate( zip_longest(data, few_shots_data, fillvalue=None) @@ -222,7 +222,7 @@ def run_benchmark(self, dry_run=False): cache_payload["few_shot_examples"] = few_shot_examples if cache_path.exists() and not self.ignore_cache and not dry_run: - with open(cache_path, "r") as fp: + with open(cache_path, "r", encoding="utf-8") as fp: cache_payload = json.load(fp) summarized_payload = { @@ -258,7 +258,7 @@ def run_benchmark(self, dry_run=False): ) # Save the cache payload - with open(cache_path, "w") as fp: + with open(cache_path, "w", encoding="utf-8") as fp: json.dump(cache_payload, fp, ensure_ascii=False) full_summary_fp.close() @@ -280,7 +280,7 @@ def run_benchmark(self, dry_run=False): task_result_path = cache_dir / "results.json" - with open(task_result_path, "w") as fp: + with open(task_result_path, "w", encoding="utf-8") as fp: json.dump(task_results, fp, ensure_ascii=False) all_task_results[name] = 
task_results @@ -497,10 +497,10 @@ def main(): all_results_path = args.results_dir / "all_results.json" if not all_results_path.exists(): - with open(all_results_path, "w") as fp: + with open(all_results_path, "w", encoding="utf-8") as fp: json.dump({}, fp) - with open(all_results_path, "r") as fp: + with open(all_results_path, "r", encoding="utf-8") as fp: all_results = json.load(fp) for asset in assets: @@ -544,5 +544,5 @@ def main(): logging.error(f"{name} failed to run") traceback.print_exc() - with open(all_results_path, "w") as fp: + with open(all_results_path, "w", encoding="utf-8") as fp: json.dump(all_results, fp, ensure_ascii=False) diff --git a/llmebench/datasets/ArSAS.py b/llmebench/datasets/ArSAS.py index 49bf1259..435a0c39 100644 --- a/llmebench/datasets/ArSAS.py +++ b/llmebench/datasets/ArSAS.py @@ -33,7 +33,7 @@ def load_data(self, data_path, no_labels=False): data_path = self.resolve_path(data_path) data = [] - with open(data_path, "r") as fp: + with open(data_path, "r", encoding="utf-8") as fp: for line_idx, line in enumerate(fp): text, label = line.strip().split("\t") data.append({"input": text, "label": label, "line_number": line_idx}) diff --git a/llmebench/datasets/PalmQA.py b/llmebench/datasets/PalmQA.py new file mode 100644 index 00000000..97bd2950 --- /dev/null +++ b/llmebench/datasets/PalmQA.py @@ -0,0 +1,62 @@ +import json + +from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType + + +class PaLMEvalDataset(DatasetBase): + def __init__(self, **kwargs): + super(PaLMEvalDataset, self).__init__(**kwargs) + + @staticmethod + def get_data_sample(): + return { + "data_id": "1", + "input": { + "question": "من الملك الذي كان يتولى الحكم في الأردن عندما تم بناء مسجد الحسين؟" + }, + "label": "بني مسجد الحسين في عهد الملك عبد الله الثاني.", + } + + @staticmethod + def metadata(): + return { + "language": "ar", + "citation": "Refer to PaLM eval paper", + "link": "https://github.com/UBC-NLP/palm", + "license": "", 
+ "splits": {"default": {"test": "test.jsonl"}}, + "task_type": TaskType.Other, + } + + def load_data(self, data_path, no_labels=False): + data_path = self.resolve_path(data_path) + data = [] + + with open(data_path, encoding="utf-8") as f: + for line in f: + obj = json.loads(line) + + # Concatenate instruction and input + instruction = obj.get("instruction") or "" + input_text = obj.get("input") or "" + + full_prompt = f"{instruction.strip()} {input_text.strip()}".strip() + + # Use "output" instead of "ideal" + output = obj.get("output") + if output is None: + print(f"Missing output for ID {obj.get('id')}") + output = "" + + label = output + + data.append( + { + "data_id": obj.get("id"), + "input": {"question": full_prompt}, + "label": label, + } + ) + + return data diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py index 643baf1f..bbc7a6ec 100644 --- a/llmebench/datasets/__init__.py +++ b/llmebench/datasets/__init__.py @@ -49,6 +49,7 @@ from .OSACT4SubtaskA import OSACT4SubtaskADataset from .OSACT4SubtaskB import OSACT4SubtaskBDataset from .PADT import PADTDataset +from .PalmQA import PaLMEvalDataset from .PIQA import PIQADataset from .QADI import QADIDataset from .QCRIDialectalArabicPOS import QCRIDialectalArabicPOSDataset