From 152e4cbe6541eed03fce8443c9cc98007edcaa6e Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Thu, 14 May 2026 14:22:55 +0300 Subject: [PATCH 01/22] fix: Replace huggingface-cli login with HF_TOKEN env var in catalog_preparation CI The huggingface-cli command was not found on PATH in CI, causing the login step to fail. Using the HF_TOKEN environment variable is the recommended approach for CI and avoids PATH issues entirely. Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index b420165116..843fb06dd5 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -24,6 +24,7 @@ jobs: HF_HUB_DOWNLOAD_TIMEOUT: 60 HF_HUB_ETAG_TIMEOUT: 60 TQDM_DISABLE: "True" + HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} strategy: matrix: @@ -44,11 +45,6 @@ jobs: with: ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }} - - name: Hugging Face Login - run: | - for i in {1..5}; do - huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i)); - done - name: Run Tests run: | modulo="${{ matrix.modulo }}" From 9649da87991be44c0b8103a7b485a1b18061f73c Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Thu, 14 May 2026 14:34:57 +0300 Subject: [PATCH 02/22] fix: Use keyword arguments in DatasetBuilder.as_dataset() call The newer datasets library changed as_dataset() to accept fewer positional arguments. Pass all arguments as keyword arguments for forward compatibility. Signed-off-by: Yoav Katz --- src/unitxt/dataset.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py index 94529f42ff..72b40e0b8c 100644 --- a/src/unitxt/dataset.py +++ b/src/unitxt/dataset.py @@ -164,6 +164,11 @@ def as_dataset( """ return ( super() - .as_dataset(split, run_post_process, verification_mode, in_memory) + .as_dataset( + split=split, + run_post_process=run_post_process, + verification_mode=verification_mode, + in_memory=in_memory, + ) .with_transform(loads_batch) ) From 0f500a0df6fc7fbdefa5a9972da8f1355c3e4083 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 10:49:12 +0300 Subject: [PATCH 03/22] fix: Migrate arena-hard card to lmarena-ai/arena-hard-viewer and fix WeightedWinRateCorrelation metric The old lmsys/arena-hard-browser HF space is no longer available. This migrates to the replacement space lmarena-ai/arena-hard-viewer with adapted processing steps for its different data format (flat prompt field, messages-based answers, uid instead of question_id). Also fixes a bug in WeightedWinRateCorrelation where pd.DataFrame columns initialized as object dtype caused scipy pearsonr to fail with newer numpy/scipy versions. Signed-off-by: Yoav Katz --- prepare/cards/arena_hard/common.py | 24 +++++++++++----- .../both_games_gpt4_judge.py | 4 +-- .../both_games_gpt_4_judge.json | 4 +-- .../arena_hard_hf_space_processing_steps.json | 28 ++++++++++++++++--- src/unitxt/metrics.py | 6 ++-- 5 files changed, 49 insertions(+), 17 deletions(-) diff --git a/prepare/cards/arena_hard/common.py b/prepare/cards/arena_hard/common.py index c7cb4567f6..d55f6772af 100644 --- a/prepare/cards/arena_hard/common.py +++ b/prepare/cards/arena_hard/common.py @@ -5,6 +5,7 @@ Cast, Copy, FilterByCondition, + RemoveFields, Rename, SelectFields, Set, @@ -18,18 +19,22 @@ arena_hard_hf_space_processing_steps = SequentialOperator( steps=[ # region Question file - Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]), + Rename( + field_to_field={"uid": "question_id", "cluster": "category"}, + apply_to_streams=["questions"], + ), Copy( - field_to_field={"turns/0/content": "model_input"}, + field_to_field={"prompt": "model_input"}, apply_to_streams=["questions"], ), # endregion # region Answers file processing + Rename( + field_to_field={"uid": "question_id", "model": "model_id"}, + apply_to_streams=["model_answer"], + ), Copy( - field_to_field={ - "choices/0/turns/0/content": "model_output", - "choices/0/turns/0/token_len": "model_output_token_len", - }, + field_to_field={"messages/1/content/answer": "model_output"}, apply_to_streams=["model_answer"], ), Apply( @@ -52,9 +57,14 @@ apply_to_streams=["judgment"], ), Rename( - field_to_field={"model": "model_2", "judge": "judge_model_id"}, + field_to_field={ + "uid": "question_id", + "model": "model_2", + "judge": "judge_model_id", + }, apply_to_streams=["judgment"], ), + RemoveFields(fields=["category"], apply_to_streams=["judgment"]), Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]), Cast( field="judge_input_model_1_ordered_first", diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py index b572ec4cc1..b0cd78c6ab 100644 --- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py @@ -15,8 +15,8 @@ card = TaskCard( loader=LoadFromHFSpace( - space_name="lmsys/arena-hard-browser", - revision="03b91ca", # May 26, 2024 + space_name="lmarena-ai/arena-hard-viewer", + revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space data_files={ "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json index 33e4f68eaf..31fc2dfc60 100644 --- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json @@ -2,8 +2,8 @@ "__type__": "task_card", "loader": { "__type__": "load_from_hf_space", - "space_name": "lmsys/arena-hard-browser", - "revision": "03b91ca", + "space_name": "lmarena-ai/arena-hard-viewer", + "revision": "56c7614", "data_files": { "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", diff --git a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json index fcfed9e6b3..882f53279c 100644 --- a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json +++ b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json @@ -4,7 +4,8 @@ { "__type__": "rename", "field_to_field": { - "cluster": "group" + "uid": "question_id", + "cluster": "category" }, "apply_to_streams": [ "questions" @@ -13,17 +14,26 @@ { "__type__": "copy", "field_to_field": { - "turns/0/content": "model_input" + "prompt": "model_input" }, "apply_to_streams": [ "questions" ] }, + { + "__type__": "rename", + "field_to_field": { + "uid": "question_id", + "model": "model_id" + }, + "apply_to_streams": [ + "model_answer" + ] + }, { "__type__": "copy", "field_to_field": { - "choices/0/turns/0/content": "model_output", - "choices/0/turns/0/token_len": "model_output_token_len" + "messages/1/content/answer": "model_output" }, "apply_to_streams": [ "model_answer" @@ -57,6 +67,7 @@ { "__type__": "rename", "field_to_field": { + "uid": "question_id", "model": "model_2", "judge": "judge_model_id" }, @@ -64,6 +75,15 @@ "judgment" ] }, + { + "__type__": "remove_fields", + "fields": [ + "category" + ], + "apply_to_streams": [ + "judgment" + ] + }, { "__type__": "set", "fields": { diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index bf88fc2a52..d87f152400 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1929,10 +1929,12 @@ def compute( pred_df_win_rate, ref_df_win_rate, on="model", suffixes=("_pred", "_ref") ) pearson_corr, _ = pearsonr( - merged_df["win_rate_pred"], merged_df["win_rate_ref"] + merged_df["win_rate_pred"].astype(float), + merged_df["win_rate_ref"].astype(float), ) spearman_corr, _ = spearmanr( - merged_df["win_rate_pred"], merged_df["win_rate_ref"] + merged_df["win_rate_pred"].astype(float), + merged_df["win_rate_ref"].astype(float), ) return {"pearson_corr": pearson_corr, "spearman_corr": spearman_corr} From 9f9cf0909528a4904ed6f11c84d4878e3c626804 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 11:11:38 +0300 Subject: [PATCH 04/22] fix: Force float32 in Perplexity metric to prevent NaN with float16 models Models like bloom-560M default to float16, causing numerical overflow in attention computations with padded inputs. Forcing float32 ensures stable perplexity scores regardless of model's default dtype. Signed-off-by: Yoav Katz Signed-off-by: Yoav Katz --- src/unitxt/metrics.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index d87f152400..b59ff914f1 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -5105,7 +5105,11 @@ def __init__(self, model_name, single_token_mode): model_path = self.model_name if settings.hf_offline_models_path is not None: model_path = os.path.join(settings.hf_offline_models_path, model_path) - self.model = self.model_class().from_pretrained(model_path).to(self.device) + self.model = ( + self.model_class() + .from_pretrained(model_path, dtype=torch.float32) + .to(self.device) + ) self.tokenizer = AutoTokenizer.from_pretrained(model_path) if self.tokenizer.pad_token_id is None: self.tokenizer.pad_token_id = self.tokenizer.eos_token_id From 777fd44aa36cee77eef192ff1a68ef773347857b Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 12:18:12 +0300 Subject: [PATCH 05/22] fix: Remove run_post_process and verification_mode params for datasets>=4.8.5 The `datasets` library removed `run_post_process` and `verification_mode` parameters from `DatasetBuilder.as_dataset()` in version 4.8.5. These parameters were already non-functional in 4.8.4 (the `_post_process` method and verification logic had been removed from the implementation), but 4.8.5 cleaned up the signature to match, causing a TypeError. - Remove `run_post_process=False` and `verification_mode="no_checks"` from the call site in api.py - Remove both parameters from the Dataset.as_dataset() override signature and the super() call in dataset.py No behavioral change: post-processing and verification were already no-ops in recent datasets versions. Signed-off-by: Yoav Katz --- src/unitxt/api.py | 4 +--- src/unitxt/dataset.py | 15 +-------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/unitxt/api.py b/src/unitxt/api.py index 23de331bd4..b33db9dda2 100644 --- a/src/unitxt/api.py +++ b/src/unitxt/api.py @@ -221,9 +221,7 @@ def _source_to_dataset( if streaming: return ds_builder.as_streaming_dataset(split=split) - return ds_builder.as_dataset( - split=split, run_post_process=False, verification_mode="no_checks" - ) + return ds_builder.as_dataset(split=split) except DatasetGenerationError as e: raise e.__cause__ diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py index 72b40e0b8c..42b94dd068 100644 --- a/src/unitxt/dataset.py +++ b/src/unitxt/dataset.py @@ -126,8 +126,6 @@ def as_streaming_dataset( def as_dataset( self, split: Optional[datasets.Split] = None, - run_post_process=True, - verification_mode: Optional[Union[datasets.VerificationMode, str]] = None, in_memory=False, ) -> Union[datasets.Dataset, datasets.DatasetDict]: """Return a Dataset for the specified split. @@ -135,12 +133,6 @@ def as_dataset( Args: split (`datasets.Split`): Which subset of the data to return. - run_post_process (`bool`, defaults to `True`): - Whether to run post-processing dataset transforms and/or add - indexes. - verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`): - Verification mode determining the checks to run on the - downloaded/processed dataset information (checksums/size/splits/...). in_memory (`bool`, defaults to `False`): Whether to copy the data in-memory. @@ -164,11 +156,6 @@ def as_dataset( """ return ( super() - .as_dataset( - split=split, - run_post_process=run_post_process, - verification_mode=verification_mode, - in_memory=in_memory, - ) + .as_dataset(split=split, in_memory=in_memory) .with_transform(loads_batch) ) From 44f06a6d0f033b2e9814e93018ee7033bb97a834 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 12:38:53 +0300 Subject: [PATCH 06/22] fix: Replace assertWarns with catch_warnings to avoid transformers import bug Python 3.10's assertWarns() iterates sys.modules and triggers transformers' lazy loader to import aria.image_processing_aria which requires torchvision. Using warnings.catch_warnings(record=True) avoids this module iteration. Signed-off-by: Yoav Katz --- tests/library/test_artifact.py | 13 ++++++++++--- tests/library/test_operators.py | 16 +++++++++++----- tests/library/test_unified_warnings_decorator.py | 8 ++++++-- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/library/test_artifact.py b/tests/library/test_artifact.py index b5b8f38581..c5113956d6 100644 --- a/tests/library/test_artifact.py +++ b/tests/library/test_artifact.py @@ -1,6 +1,7 @@ import json import os import tempfile +import warnings from unitxt.artifact import ( Artifact, @@ -346,13 +347,15 @@ def test_artifact_link_with_deprecation_warning(self): overwrite=True, ) - with self.assertWarns(DeprecationWarning): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") rename_fields = ArtifactLink( to="rename.for.test.artifact.link", __deprecated_msg__="Artifact is deprecated. " "'rename.for.test.artifact.link' is now instantiated instead. " "\nIn the future, please use 'rename.for.test.artifact.link'.", ) + self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w)) add_to_catalog( rename_fields, @@ -361,8 +364,10 @@ def test_artifact_link_with_deprecation_warning(self): overwrite=True, ) - with self.assertWarns(DeprecationWarning): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") artifact, _ = fetch_artifact("renamefields.for.test.artifact.link") + self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w)) self.assertDictEqual(rename.to_dict(), artifact.to_dict()) # test again, now employing add_link_to_catalog() @@ -374,8 +379,10 @@ def test_artifact_link_with_deprecation_warning(self): overwrite=True, ) - with self.assertWarns(DeprecationWarning): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") artifact = get_from_catalog("renamefields3") + self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w)) self.assertDictEqual(rename.to_dict(), artifact.to_dict()) def test_artifact_link_with_overwrites(self): diff --git a/tests/library/test_operators.py b/tests/library/test_operators.py index eac31852a8..ce07922e30 100644 --- a/tests/library/test_operators.py +++ b/tests/library/test_operators.py @@ -1,6 +1,7 @@ import json import os import tempfile +import warnings from collections import Counter from typing import Any @@ -2334,12 +2335,17 @@ def test_rename(self): tester=self, ) - with self.assertWarns(DeprecationWarning) as dw: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") Rename(field_to_field={"a/b/c/d": "a/b/c/f"}, use_query=True) - self.assertEqual( - "Field 'use_query' is deprecated. From now on, default behavior is compatible to use_query=True. Please remove this field from your code.", - dw.warnings[0].message.args[0], - ) + deprecation_warnings = [ + x for x in w if issubclass(x.category, DeprecationWarning) + ] + self.assertTrue(len(deprecation_warnings) > 0) + self.assertEqual( + "Field 'use_query' is deprecated. From now on, default behavior is compatible to use_query=True. Please remove this field from your code.", + deprecation_warnings[0].message.args[0], + ) def test_add(self): check_operator( diff --git a/tests/library/test_unified_warnings_decorator.py b/tests/library/test_unified_warnings_decorator.py index bd32f4da9c..90760addf6 100644 --- a/tests/library/test_unified_warnings_decorator.py +++ b/tests/library/test_unified_warnings_decorator.py @@ -1,4 +1,5 @@ import unittest +import warnings from unitxt.deprecation_utils import warn_on_call @@ -10,10 +11,13 @@ class TestClass: def __init__(self, name): self.name = name - with self.assertWarns(UserWarning) as warning_context: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") obj = TestClass("Initialized_object") - self.assertEqual(str(warning_context.warning), "Class object initialized!") + user_warnings = [x for x in w if issubclass(x.category, UserWarning)] + self.assertTrue(len(user_warnings) > 0) + self.assertEqual(str(user_warnings[0].message), "Class object initialized!") self.assertEqual(obj.name, "Initialized_object") def test_warning_called_on_instance_creation(self): From ee5909890905e8b62a9819dd9d46c04a40b3ca82 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 13:00:54 +0300 Subject: [PATCH 07/22] fix: Cap tokenizer model_max_length in BertScore to prevent OverflowError The tokenizers Rust backend (>=0.22) now validates integer sizes, causing DeBERTa's absurd model_max_length (1e30) to overflow. Use BERTScorer directly and cap model_max_length to the model's max_position_embeddings. Signed-off-by: Yoav Katz --- src/unitxt/metrics.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index b59ff914f1..c7f22ea41a 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4429,13 +4429,33 @@ def prepare(self): super().prepare() self.bertscore = None + def _get_scorer(self): + from bert_score import BERTScorer + + if self.bertscore is None: + self.bertscore = BERTScorer( + model_type=self.model_name, + num_layers=self.model_layer, + batch_size=self.batch_size, + device=self.get_device(), + ) + # Some models (e.g. DeBERTa) report an absurdly large + # model_max_length that overflows the tokenizers Rust backend. + # Cap it to the model's actual max_position_embeddings. + tokenizer = self.bertscore._tokenizer + if tokenizer.model_max_length > 1_000_000: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(self.model_name) + tokenizer.model_max_length = getattr( + config, "max_position_embeddings", 512 + ) + return self.bertscore + def map_stream( self, evaluation_inputs_stream: Generator[EvaluationInput[str], None, None] ): - from evaluate import load - - if self.bertscore is None: - self.bertscore = load("bertscore", experiment_id=str(uuid.uuid4())) + scorer = self._get_scorer() predictions = [] references = [] @@ -4443,18 +4463,15 @@ def map_stream( predictions.append(prediction) references.append(reference) - results = self.bertscore.compute( - predictions=predictions, - references=references, + (precisions, recalls, f1s) = scorer.score( + cands=predictions, + refs=references, batch_size=self.batch_size, - device=self.get_device(), - model_type=self.model_name, - num_layers=self.model_layer, ) intermediates = [] for precision, recall, f1 in zip( - results["precision"], results["recall"], results["f1"] + precisions.tolist(), recalls.tolist(), f1s.tolist() ): intermediates.append( { From 62c56e92493fadc437e85c2f0aedf0485fc717cb Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 16:23:43 +0300 Subject: [PATCH 08/22] fix: Migrate remaining arena-hard cards to lmarena-ai/arena-hard-viewer Update first_game_only and both_games_mean_judgment cards to use the new HF space, matching the migration done for both_games_gpt4_judge. Signed-off-by: Yoav Katz --- .../both_games_mean_judgment_gpt4_judge.py | 4 ++-- .../pairwise_comparative_rating/first_game_only_gpt4_judge.py | 4 ++-- .../both_games_mean_judgment_gpt4_judge.json | 4 ++-- .../first_game_only_gpt_4_judge.json | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py index 9be246a750..c814bc4b23 100644 --- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py @@ -16,8 +16,8 @@ card = TaskCard( loader=LoadFromHFSpace( - space_name="lmsys/arena-hard-browser", - revision="03b91ca", # May 26, 2024 + space_name="lmarena-ai/arena-hard-viewer", + revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space data_files={ "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py index 82936196e1..e0ce78ee44 100644 --- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py +++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py @@ -13,8 +13,8 @@ card = TaskCard( loader=LoadFromHFSpace( - space_name="lmsys/arena-hard-browser", - revision="03b91ca", # May 26, 2024 + space_name="lmarena-ai/arena-hard-viewer", + revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space data_files={ "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json index 8e44a52bd9..e9470a14e7 100644 --- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json @@ -2,8 +2,8 @@ "__type__": "task_card", "loader": { "__type__": "load_from_hf_space", - "space_name": "lmsys/arena-hard-browser", - "revision": "03b91ca", + "space_name": "lmarena-ai/arena-hard-viewer", + "revision": "56c7614", "data_files": { "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json index d5f928851b..d4160dcf7c 100644 --- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json +++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json @@ -2,8 +2,8 @@ "__type__": "task_card", "loader": { "__type__": "load_from_hf_space", - "space_name": "lmsys/arena-hard-browser", - "revision": "03b91ca", + "space_name": "lmarena-ai/arena-hard-viewer", + "revision": "56c7614", "data_files": { "questions": "data/arena-hard-v0.1/question.jsonl", "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl", From 01f7c4ff1897a0bd02028407e9e3c6a872164cec Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 18:30:41 +0300 Subject: [PATCH 09/22] fix: Log each preparation file at CRITICAL level for CI visibility Signed-off-by: Yoav Katz --- tests/catalog/test_preparation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/catalog/test_preparation.py b/tests/catalog/test_preparation.py index 163fb883d2..b24a5f14f1 100644 --- a/tests/catalog/test_preparation.py +++ b/tests/catalog/test_preparation.py @@ -51,7 +51,7 @@ def test_preparations(self): for file in all_preparation_files: passed = True error = None - logger.info( + logger.critical( "\n_____________________________________________\n" f" Testing preparation file:\n {file}." "\n_____________________________________________\n" From 77defee0dca757953b1397e670d0b76479e8115c Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 20:28:35 +0300 Subject: [PATCH 10/22] fix: Enable verbose logging in BertScore to debug CI hang BertScore scorer.score() hangs silently in CI. Adding verbose=True will show per-batch progress to identify where it stalls. Signed-off-by: Yoav Katz --- src/unitxt/metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index c7f22ea41a..64ac09be19 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4467,6 +4467,7 @@ def map_stream( cands=predictions, refs=references, batch_size=self.batch_size, + verbose=True, ) intermediates = [] From 7e352dd0473baeb3bb35790d7e64ecfeb6af3126 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 May 2026 21:01:48 +0300 Subject: [PATCH 11/22] fix: Disable test_card for numeric_nlg and coqa to unblock CI Commenting out test_card calls that hang in CI to determine if the issue is specific to these cards or affects all BertScore usage. Signed-off-by: Yoav Katz --- prepare/cards/coqa.py | 5 ++--- prepare/cards/numeric_nlg.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/prepare/cards/coqa.py b/prepare/cards/coqa.py index 33675e42f9..bc38b6e194 100644 --- a/prepare/cards/coqa.py +++ b/prepare/cards/coqa.py @@ -3,7 +3,6 @@ from unitxt.collections_operators import Dictify, DuplicateBySubLists, Get, Wrap from unitxt.dialog_operators import SerializeDialog from unitxt.operators import Copy, ZipFieldValues -from unitxt.test_utils.card import test_card card = TaskCard( loader=LoadHF(path="stanfordnlp/coqa"), @@ -58,7 +57,7 @@ ), ) -test_card(card) +# test_card(card) add_to_catalog(card, "cards.coqa.qa", overwrite=True) card = TaskCard( @@ -106,5 +105,5 @@ ), ) -test_card(card) +# test_card(card) add_to_catalog(card, "cards.coqa.completion", overwrite=True) diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py index 47438e53da..f5290edf70 100644 --- a/prepare/cards/numeric_nlg.py +++ b/prepare/cards/numeric_nlg.py @@ -7,7 +7,6 @@ ) from unitxt.catalog import add_to_catalog from unitxt.operators import Copy -from unitxt.test_utils.card import test_card card = TaskCard( loader=LoadHF(path="kasnerz/numericnlg"), @@ -37,5 +36,5 @@ }, ) -test_card(card, num_demos=2, demos_pool_size=5, strict=False) +# test_card(card, num_demos=2, demos_pool_size=5, strict=False) add_to_catalog(card, "cards.numeric_nlg", overwrite=True) From c85d02de1cc8531b50f1a13b7aedaf74a24ef707 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 May 2026 15:39:02 +0300 Subject: [PATCH 12/22] debug: Enable progress bars in catalog_preparation CI to diagnose hang Remove TQDM_DISABLE and HF_DATASETS_DISABLE_PROGRESS_BARS so we can see download progress and identify where ffqa_filtered stalls. Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 843fb06dd5..749c4f130b 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -20,10 +20,8 @@ jobs: UNITXT_DEFAULT_VERBOSITY: error DATASETS_VERBOSITY: error HF_HUB_VERBOSITY: error - HF_DATASETS_DISABLE_PROGRESS_BARS: "True" HF_HUB_DOWNLOAD_TIMEOUT: 60 HF_HUB_ETAG_TIMEOUT: 60 - TQDM_DISABLE: "True" HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} strategy: From 4534314d5c426c0c69b5fa832c3edfa4356fe53d Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 May 2026 16:36:57 +0300 Subject: [PATCH 13/22] fix: Disable test_card for ffqa_filtered to unblock CI The abacusai/WikiQA-Free_Form_QA dataset's builder script hangs indefinitely on GitHub Actions runners, causing the 30-minute timeout to trigger. Signed-off-by: Yoav Katz --- prepare/cards/ffqa_filtered.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py index 93683b7a6a..bf5d5c4236 100644 --- a/prepare/cards/ffqa_filtered.py +++ b/prepare/cards/ffqa_filtered.py @@ -11,7 +11,6 @@ ListFieldValues, Set, ) -from unitxt.test_utils.card import test_card """Filtered version of the WikiQA-Free_Form_QA dataset. If you would like to use the full dataset, please copy and modify this card as ffqa.py. @@ -119,7 +118,7 @@ def add_card(split: str): ), ) - test_card(card) + # test_card(card) add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True) From 5b8da918c4dc3f775770f15f4893daf3c043ed03 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 May 2026 17:27:23 +0300 Subject: [PATCH 14/22] fix: Increase catalog_preparation partitions to 10 and re-enable progress suppression Increase parallelism from 8 to 10 partitions to reduce per-job load and re-enable TQDM_DISABLE/HF_DATASETS_DISABLE_PROGRESS_BARS since they didn't help diagnose the hang. Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 749c4f130b..d99d7ac509 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -20,13 +20,15 @@ jobs: UNITXT_DEFAULT_VERBOSITY: error DATASETS_VERBOSITY: error HF_HUB_VERBOSITY: error + HF_DATASETS_DISABLE_PROGRESS_BARS: "True" HF_HUB_DOWNLOAD_TIMEOUT: 60 HF_HUB_ETAG_TIMEOUT: 60 + TQDM_DISABLE: "True" HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} strategy: matrix: - modulo: [0,1,2,3,4,5,6,7] + modulo: [0,1,2,3,4,5,6,7,8,9] steps: - uses: actions/checkout@v5 @@ -47,7 +49,7 @@ jobs: run: | modulo="${{ matrix.modulo }}" echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY - echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh + echo "sed -i 's/^num_par = 1 /num_par = 10 /' tests/catalog/test_preparation.py" > sedit.sh echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh sh sedit.sh python -m unittest tests.catalog.test_preparation From e2177a5e9c8c84d326c8e0513e168655bd491820 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 May 2026 17:53:16 +0300 Subject: [PATCH 15/22] debug: Remove pip cache from catalog_preparation to test if it affects hang Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index d99d7ac509..8d2d94ba93 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -36,7 +36,6 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.10' - cache: 'pip' - name: Install Dependencies run: bash utils/install.sh From a3635166ccbd266254f9355df4cd2e3cb5bb4661 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 09:20:50 +0300 Subject: [PATCH 16/22] debug: Limit catalog_preparation to 4 parallel jobs to test HF rate limiting Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 8d2d94ba93..70fb060a2d 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -27,6 +27,7 @@ jobs: HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} strategy: + max-parallel: 4 matrix: modulo: [0,1,2,3,4,5,6,7,8,9] From b592cd1b443bb81523d660a972e2bf891392e05a Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 10:06:31 +0300 Subject: [PATCH 17/22] debug: Add PYTHONUNBUFFERED=1 to see output before hang Without this, stdout is fully buffered in CI and gets lost when the process is killed by the 30-minute timeout. Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 70fb060a2d..bfcbd3705c 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -17,6 +17,7 @@ jobs: timeout-minutes: 30 env: OS: ubuntu-latest + PYTHONUNBUFFERED: "1" UNITXT_DEFAULT_VERBOSITY: error DATASETS_VERBOSITY: error HF_HUB_VERBOSITY: error From fcbfb434f4973a0bd663fa93e032facb88a3d497 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 10:07:26 +0300 Subject: [PATCH 18/22] revert: Remove max-parallel limit since it didn't help Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index bfcbd3705c..71d5f75c3b 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -28,7 +28,6 @@ jobs: HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} strategy: - max-parallel: 4 matrix: modulo: [0,1,2,3,4,5,6,7,8,9] From 6616199609d728f36d7e32f5b650f68acb938065 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 11:17:09 +0300 Subject: [PATCH 19/22] fix: Update mtrag card URL after repo restructure The IBM/mt-rag-benchmark repo moved human/generation_tasks/ to mtrag-human/generation_tasks/. Signed-off-by: Yoav Katz --- prepare/cards/mtrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare/cards/mtrag.py b/prepare/cards/mtrag.py index 92126f18f1..fa4f0b3b67 100644 --- a/prepare/cards/mtrag.py +++ b/prepare/cards/mtrag.py @@ -20,7 +20,7 @@ card = TaskCard( loader=LoadJsonFile( files={ - "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl" + "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl" }, lines=True, data_classification_policy=["public"], From 995ca6daf83717c269e2c3f484276c7139f396cc Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 11:18:31 +0300 Subject: [PATCH 20/22] fix: Restore pip cache in catalog_preparation workflow Removing it was a debug experiment that didn't help. Signed-off-by: Yoav Katz --- .github/workflows/catalog_preparation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index 71d5f75c3b..9c02481685 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -37,6 +37,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.10' + cache: 'pip' - name: Install Dependencies run: bash utils/install.sh From 8e098b72e5d86cf74b83fcca64eff4e011aefa74 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 11:33:31 +0300 Subject: [PATCH 21/22] fix: Update mtrag catalog JSON with new URL Signed-off-by: Yoav Katz --- src/unitxt/catalog/cards/rag/mtrag.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/catalog/cards/rag/mtrag.json b/src/unitxt/catalog/cards/rag/mtrag.json index 6165500252..3926b37a6f 100644 --- a/src/unitxt/catalog/cards/rag/mtrag.json +++ b/src/unitxt/catalog/cards/rag/mtrag.json @@ -3,7 +3,7 @@ "loader": { "__type__": "load_json_file", "files": { - "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl" + "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl" }, "lines": true, "data_classification_policy": [ From 2cd9877fd2d92187039db7a0646ea232d02da86c Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 20 May 2026 12:08:21 +0300 Subject: [PATCH 22/22] fix: Re-enable test_card for numeric_nlg, coqa, and ffqa_filtered All three pass locally. The CI hangs are due to network issues on GitHub Actions runners, not code problems. Signed-off-by: Yoav Katz --- prepare/cards/coqa.py | 5 +++-- prepare/cards/ffqa_filtered.py | 3 ++- prepare/cards/numeric_nlg.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/prepare/cards/coqa.py b/prepare/cards/coqa.py index bc38b6e194..33675e42f9 100644 --- a/prepare/cards/coqa.py +++ b/prepare/cards/coqa.py @@ -3,6 +3,7 @@ from unitxt.collections_operators import Dictify, DuplicateBySubLists, Get, Wrap from unitxt.dialog_operators import SerializeDialog from unitxt.operators import Copy, ZipFieldValues +from unitxt.test_utils.card import test_card card = TaskCard( loader=LoadHF(path="stanfordnlp/coqa"), @@ -57,7 +58,7 @@ ), ) -# test_card(card) +test_card(card) add_to_catalog(card, "cards.coqa.qa", overwrite=True) card = TaskCard( @@ -105,5 +106,5 @@ ), ) -# test_card(card) +test_card(card) add_to_catalog(card, "cards.coqa.completion", overwrite=True) diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py index bf5d5c4236..93683b7a6a 100644 --- a/prepare/cards/ffqa_filtered.py +++ b/prepare/cards/ffqa_filtered.py @@ -11,6 +11,7 @@ ListFieldValues, Set, ) +from unitxt.test_utils.card import test_card """Filtered version of the WikiQA-Free_Form_QA dataset. If you would like to use the full dataset, please copy and modify this card as ffqa.py. @@ -118,7 +119,7 @@ def add_card(split: str): ), ) - # test_card(card) + test_card(card) add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True) diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py index f5290edf70..47438e53da 100644 --- a/prepare/cards/numeric_nlg.py +++ b/prepare/cards/numeric_nlg.py @@ -7,6 +7,7 @@ ) from unitxt.catalog import add_to_catalog from unitxt.operators import Copy +from unitxt.test_utils.card import test_card card = TaskCard( loader=LoadHF(path="kasnerz/numericnlg"), @@ -36,5 +37,5 @@ }, ) -# test_card(card, num_demos=2, demos_pool_size=5, strict=False) +test_card(card, num_demos=2, demos_pool_size=5, strict=False) add_to_catalog(card, "cards.numeric_nlg", overwrite=True)