From 152e4cbe6541eed03fce8443c9cc98007edcaa6e Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Thu, 14 May 2026 14:22:55 +0300
Subject: [PATCH 01/22] fix: Replace huggingface-cli login with HF_TOKEN env
 var in catalog_preparation CI

The huggingface-cli command was not found on PATH in CI, causing the
login step to fail. Using the HF_TOKEN environment variable is the
recommended approach for CI and avoids PATH issues entirely.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index b420165116..843fb06dd5 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -24,6 +24,7 @@ jobs:
       HF_HUB_DOWNLOAD_TIMEOUT: 60
       HF_HUB_ETAG_TIMEOUT: 60
       TQDM_DISABLE: "True"
+      HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
       matrix:
@@ -44,11 +45,6 @@ jobs:
       with:
         ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }}
 
-    - name:  Hugging Face Login
-      run: |
-        for i in {1..5}; do
-          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
-        done
     - name: Run Tests
       run: |
         modulo="${{ matrix.modulo }}"

From 9649da87991be44c0b8103a7b485a1b18061f73c Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Thu, 14 May 2026 14:34:57 +0300
Subject: [PATCH 02/22] fix: Use keyword arguments in
 DatasetBuilder.as_dataset() call

The newer datasets library changed as_dataset() to accept fewer
positional arguments. Pass all arguments as keyword arguments for
forward compatibility.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/dataset.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py
index 94529f42ff..72b40e0b8c 100644
--- a/src/unitxt/dataset.py
+++ b/src/unitxt/dataset.py
@@ -164,6 +164,11 @@ def as_dataset(
         """
         return (
             super()
-            .as_dataset(split, run_post_process, verification_mode, in_memory)
+            .as_dataset(
+                split=split,
+                run_post_process=run_post_process,
+                verification_mode=verification_mode,
+                in_memory=in_memory,
+            )
             .with_transform(loads_batch)
         )

From 0f500a0df6fc7fbdefa5a9972da8f1355c3e4083 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 10:49:12 +0300
Subject: [PATCH 03/22] fix: Migrate arena-hard card to
 lmarena-ai/arena-hard-viewer and fix WeightedWinRateCorrelation metric

The old lmsys/arena-hard-browser HF space is no longer available. This
migrates to the replacement space lmarena-ai/arena-hard-viewer with
adapted processing steps for its different data format (flat prompt
field, messages-based answers, uid instead of question_id).

Also fixes a bug in WeightedWinRateCorrelation where pd.DataFrame
columns initialized as object dtype caused scipy pearsonr to fail
with newer numpy/scipy versions.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 prepare/cards/arena_hard/common.py            | 24 +++++++++++-----
 .../both_games_gpt4_judge.py                  |  4 +--
 .../both_games_gpt_4_judge.json               |  4 +--
 .../arena_hard_hf_space_processing_steps.json | 28 ++++++++++++++++---
 src/unitxt/metrics.py                         |  6 ++--
 5 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/prepare/cards/arena_hard/common.py b/prepare/cards/arena_hard/common.py
index c7cb4567f6..d55f6772af 100644
--- a/prepare/cards/arena_hard/common.py
+++ b/prepare/cards/arena_hard/common.py
@@ -5,6 +5,7 @@
     Cast,
     Copy,
     FilterByCondition,
+    RemoveFields,
     Rename,
     SelectFields,
     Set,
@@ -18,18 +19,22 @@
 arena_hard_hf_space_processing_steps = SequentialOperator(
     steps=[
         # region Question file
-        Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
+        Rename(
+            field_to_field={"uid": "question_id", "cluster": "category"},
+            apply_to_streams=["questions"],
+        ),
         Copy(
-            field_to_field={"turns/0/content": "model_input"},
+            field_to_field={"prompt": "model_input"},
             apply_to_streams=["questions"],
         ),
         # endregion
         # region Answers file processing
+        Rename(
+            field_to_field={"uid": "question_id", "model": "model_id"},
+            apply_to_streams=["model_answer"],
+        ),
         Copy(
-            field_to_field={
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len",
-            },
+            field_to_field={"messages/1/content/answer": "model_output"},
             apply_to_streams=["model_answer"],
         ),
         Apply(
@@ -52,9 +57,14 @@
             apply_to_streams=["judgment"],
         ),
         Rename(
-            field_to_field={"model": "model_2", "judge": "judge_model_id"},
+            field_to_field={
+                "uid": "question_id",
+                "model": "model_2",
+                "judge": "judge_model_id",
+            },
             apply_to_streams=["judgment"],
         ),
+        RemoveFields(fields=["category"], apply_to_streams=["judgment"]),
         Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]),
         Cast(
             field="judge_input_model_1_ordered_first",
diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
index b572ec4cc1..b0cd78c6ab 100644
--- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
+++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
@@ -15,8 +15,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
index 33e4f68eaf..31fc2dfc60 100644
--- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
+++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
diff --git a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json
index fcfed9e6b3..882f53279c 100644
--- a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json
+++ b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json
@@ -4,7 +4,8 @@
         {
             "__type__": "rename",
             "field_to_field": {
-                "cluster": "group"
+                "uid": "question_id",
+                "cluster": "category"
             },
             "apply_to_streams": [
                 "questions"
@@ -13,17 +14,26 @@
         {
             "__type__": "copy",
             "field_to_field": {
-                "turns/0/content": "model_input"
+                "prompt": "model_input"
             },
             "apply_to_streams": [
                 "questions"
             ]
         },
+        {
+            "__type__": "rename",
+            "field_to_field": {
+                "uid": "question_id",
+                "model": "model_id"
+            },
+            "apply_to_streams": [
+                "model_answer"
+            ]
+        },
         {
             "__type__": "copy",
             "field_to_field": {
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len"
+                "messages/1/content/answer": "model_output"
             },
             "apply_to_streams": [
                 "model_answer"
@@ -57,6 +67,7 @@
         {
             "__type__": "rename",
             "field_to_field": {
+                "uid": "question_id",
                 "model": "model_2",
                 "judge": "judge_model_id"
             },
@@ -64,6 +75,15 @@
                 "judgment"
             ]
         },
+        {
+            "__type__": "remove_fields",
+            "fields": [
+                "category"
+            ],
+            "apply_to_streams": [
+                "judgment"
+            ]
+        },
         {
             "__type__": "set",
             "fields": {
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index bf88fc2a52..d87f152400 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -1929,10 +1929,12 @@ def compute(
             pred_df_win_rate, ref_df_win_rate, on="model", suffixes=("_pred", "_ref")
         )
         pearson_corr, _ = pearsonr(
-            merged_df["win_rate_pred"], merged_df["win_rate_ref"]
+            merged_df["win_rate_pred"].astype(float),
+            merged_df["win_rate_ref"].astype(float),
         )
         spearman_corr, _ = spearmanr(
-            merged_df["win_rate_pred"], merged_df["win_rate_ref"]
+            merged_df["win_rate_pred"].astype(float),
+            merged_df["win_rate_ref"].astype(float),
         )
 
         return {"pearson_corr": pearson_corr, "spearman_corr": spearman_corr}

From 9f9cf0909528a4904ed6f11c84d4878e3c626804 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 11:11:38 +0300
Subject: [PATCH 04/22] fix: Force float32 in Perplexity metric to prevent NaN
 with float16 models

Models like bloom-560M default to float16, causing numerical overflow
in attention computations with padded inputs. Forcing float32 ensures
stable perplexity scores regardless of the model's default dtype.

Signed-off-by: Yoav Katz <yoavkatz@il.ibm.com>
Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/metrics.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index d87f152400..b59ff914f1 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -5105,7 +5105,11 @@ def __init__(self, model_name, single_token_mode):
             model_path = self.model_name
             if settings.hf_offline_models_path is not None:
                 model_path = os.path.join(settings.hf_offline_models_path, model_path)
-            self.model = self.model_class().from_pretrained(model_path).to(self.device)
+            self.model = (
+                self.model_class()
+                .from_pretrained(model_path, dtype=torch.float32)
+                .to(self.device)
+            )
             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
             if self.tokenizer.pad_token_id is None:
                 self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

From 777fd44aa36cee77eef192ff1a68ef773347857b Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 12:18:12 +0300
Subject: [PATCH 05/22] fix: Remove run_post_process and verification_mode
 params for datasets>=4.8.5

The `datasets` library removed `run_post_process` and `verification_mode`
parameters from `DatasetBuilder.as_dataset()` in version 4.8.5. These
parameters were already non-functional in 4.8.4 (the `_post_process`
method and verification logic had been removed from the implementation),
but 4.8.5 cleaned up the signature to match, causing a TypeError.

- Remove `run_post_process=False` and `verification_mode="no_checks"`
  from the call site in api.py
- Remove both parameters from the Dataset.as_dataset() override signature
  and the super() call in dataset.py

No behavioral change: post-processing and verification were already
no-ops in recent datasets versions.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/api.py     |  4 +---
 src/unitxt/dataset.py | 15 +--------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/unitxt/api.py b/src/unitxt/api.py
index 23de331bd4..b33db9dda2 100644
--- a/src/unitxt/api.py
+++ b/src/unitxt/api.py
@@ -221,9 +221,7 @@ def _source_to_dataset(
         if streaming:
             return ds_builder.as_streaming_dataset(split=split)
 
-        return ds_builder.as_dataset(
-            split=split, run_post_process=False, verification_mode="no_checks"
-        )
+        return ds_builder.as_dataset(split=split)
 
     except DatasetGenerationError as e:
         raise e.__cause__
diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py
index 72b40e0b8c..42b94dd068 100644
--- a/src/unitxt/dataset.py
+++ b/src/unitxt/dataset.py
@@ -126,8 +126,6 @@ def as_streaming_dataset(
     def as_dataset(
         self,
         split: Optional[datasets.Split] = None,
-        run_post_process=True,
-        verification_mode: Optional[Union[datasets.VerificationMode, str]] = None,
         in_memory=False,
     ) -> Union[datasets.Dataset, datasets.DatasetDict]:
         """Return a Dataset for the specified split.
@@ -135,12 +133,6 @@ def as_dataset(
         Args:
             split (`datasets.Split`):
                 Which subset of the data to return.
-            run_post_process (`bool`, defaults to `True`):
-                Whether to run post-processing dataset transforms and/or add
-                indexes.
-            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
-                Verification mode determining the checks to run on the
-                downloaded/processed dataset information (checksums/size/splits/...).
             in_memory (`bool`, defaults to `False`):
                 Whether to copy the data in-memory.
 
@@ -164,11 +156,6 @@ def as_dataset(
         """
         return (
             super()
-            .as_dataset(
-                split=split,
-                run_post_process=run_post_process,
-                verification_mode=verification_mode,
-                in_memory=in_memory,
-            )
+            .as_dataset(split=split, in_memory=in_memory)
             .with_transform(loads_batch)
         )

From 44f06a6d0f033b2e9814e93018ee7033bb97a834 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 12:38:53 +0300
Subject: [PATCH 06/22] fix: Replace assertWarns with catch_warnings to avoid
 transformers import bug

Python 3.10's assertWarns() iterates sys.modules and triggers transformers'
lazy loader to import aria.image_processing_aria, which requires torchvision.
Using warnings.catch_warnings(record=True) avoids this module iteration.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 tests/library/test_artifact.py                   | 13 ++++++++++---
 tests/library/test_operators.py                  | 16 +++++++++++-----
 tests/library/test_unified_warnings_decorator.py |  8 ++++++--
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/tests/library/test_artifact.py b/tests/library/test_artifact.py
index b5b8f38581..c5113956d6 100644
--- a/tests/library/test_artifact.py
+++ b/tests/library/test_artifact.py
@@ -1,6 +1,7 @@
 import json
 import os
 import tempfile
+import warnings
 
 from unitxt.artifact import (
     Artifact,
@@ -346,13 +347,15 @@ def test_artifact_link_with_deprecation_warning(self):
                 overwrite=True,
             )
 
-            with self.assertWarns(DeprecationWarning):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
                 rename_fields = ArtifactLink(
                     to="rename.for.test.artifact.link",
                     __deprecated_msg__="Artifact is deprecated. "
                     "'rename.for.test.artifact.link' is now instantiated instead. "
                     "\nIn the future, please use 'rename.for.test.artifact.link'.",
                 )
+            self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w))
 
             add_to_catalog(
                 rename_fields,
@@ -361,8 +364,10 @@ def test_artifact_link_with_deprecation_warning(self):
                 overwrite=True,
             )
 
-            with self.assertWarns(DeprecationWarning):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
                 artifact, _ = fetch_artifact("renamefields.for.test.artifact.link")
+            self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w))
             self.assertDictEqual(rename.to_dict(), artifact.to_dict())
 
             # test again, now employing add_link_to_catalog()
@@ -374,8 +379,10 @@ def test_artifact_link_with_deprecation_warning(self):
                 overwrite=True,
             )
 
-            with self.assertWarns(DeprecationWarning):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
                 artifact = get_from_catalog("renamefields3")
+            self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w))
             self.assertDictEqual(rename.to_dict(), artifact.to_dict())
 
     def test_artifact_link_with_overwrites(self):
diff --git a/tests/library/test_operators.py b/tests/library/test_operators.py
index eac31852a8..ce07922e30 100644
--- a/tests/library/test_operators.py
+++ b/tests/library/test_operators.py
@@ -1,6 +1,7 @@
 import json
 import os
 import tempfile
+import warnings
 from collections import Counter
 from typing import Any
 
@@ -2334,12 +2335,17 @@ def test_rename(self):
             tester=self,
         )
 
-        with self.assertWarns(DeprecationWarning) as dw:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
             Rename(field_to_field={"a/b/c/d": "a/b/c/f"}, use_query=True)
-            self.assertEqual(
-                "Field 'use_query' is deprecated. From now on, default behavior is compatible to use_query=True. Please remove this field from your code.",
-                dw.warnings[0].message.args[0],
-            )
+        deprecation_warnings = [
+            x for x in w if issubclass(x.category, DeprecationWarning)
+        ]
+        self.assertTrue(len(deprecation_warnings) > 0)
+        self.assertEqual(
+            "Field 'use_query' is deprecated. From now on, default behavior is compatible to use_query=True. Please remove this field from your code.",
+            deprecation_warnings[0].message.args[0],
+        )
 
     def test_add(self):
         check_operator(
diff --git a/tests/library/test_unified_warnings_decorator.py b/tests/library/test_unified_warnings_decorator.py
index bd32f4da9c..90760addf6 100644
--- a/tests/library/test_unified_warnings_decorator.py
+++ b/tests/library/test_unified_warnings_decorator.py
@@ -1,4 +1,5 @@
 import unittest
+import warnings
 
 from unitxt.deprecation_utils import warn_on_call
 
@@ -10,10 +11,13 @@ class TestClass:
             def __init__(self, name):
                 self.name = name
 
-        with self.assertWarns(UserWarning) as warning_context:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
             obj = TestClass("Initialized_object")
 
-        self.assertEqual(str(warning_context.warning), "Class object initialized!")
+        user_warnings = [x for x in w if issubclass(x.category, UserWarning)]
+        self.assertTrue(len(user_warnings) > 0)
+        self.assertEqual(str(user_warnings[0].message), "Class object initialized!")
         self.assertEqual(obj.name, "Initialized_object")
 
     def test_warning_called_on_instance_creation(self):

From ee5909890905e8b62a9819dd9d46c04a40b3ca82 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 13:00:54 +0300
Subject: [PATCH 07/22] fix: Cap tokenizer model_max_length in BertScore to
 prevent OverflowError

The tokenizers Rust backend (>=0.22) now validates integer sizes, causing
DeBERTa's absurd model_max_length (1e30) to overflow. Use BERTScorer
directly and, when the tokenizer reports a model_max_length above
1,000,000, cap it to the model's max_position_embeddings (falling back
to 512 if the config does not define it).

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/metrics.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index b59ff914f1..c7f22ea41a 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -4429,13 +4429,33 @@ def prepare(self):
         super().prepare()
         self.bertscore = None
 
+    def _get_scorer(self):
+        from bert_score import BERTScorer
+
+        if self.bertscore is None:
+            self.bertscore = BERTScorer(
+                model_type=self.model_name,
+                num_layers=self.model_layer,
+                batch_size=self.batch_size,
+                device=self.get_device(),
+            )
+            # Some models (e.g. DeBERTa) report an absurdly large
+            # model_max_length that overflows the tokenizers Rust backend.
+            # Cap it to the model's actual max_position_embeddings.
+            tokenizer = self.bertscore._tokenizer
+            if tokenizer.model_max_length > 1_000_000:
+                from transformers import AutoConfig
+
+                config = AutoConfig.from_pretrained(self.model_name)
+                tokenizer.model_max_length = getattr(
+                    config, "max_position_embeddings", 512
+                )
+        return self.bertscore
+
     def map_stream(
         self, evaluation_inputs_stream: Generator[EvaluationInput[str], None, None]
     ):
-        from evaluate import load
-
-        if self.bertscore is None:
-            self.bertscore = load("bertscore", experiment_id=str(uuid.uuid4()))
+        scorer = self._get_scorer()
 
         predictions = []
         references = []
@@ -4443,18 +4463,15 @@ def map_stream(
             predictions.append(prediction)
             references.append(reference)
 
-        results = self.bertscore.compute(
-            predictions=predictions,
-            references=references,
+        (precisions, recalls, f1s) = scorer.score(
+            cands=predictions,
+            refs=references,
             batch_size=self.batch_size,
-            device=self.get_device(),
-            model_type=self.model_name,
-            num_layers=self.model_layer,
         )
 
         intermediates = []
         for precision, recall, f1 in zip(
-            results["precision"], results["recall"], results["f1"]
+            precisions.tolist(), recalls.tolist(), f1s.tolist()
         ):
             intermediates.append(
                 {

From 62c56e92493fadc437e85c2f0aedf0485fc717cb Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 16:23:43 +0300
Subject: [PATCH 08/22] fix: Migrate remaining arena-hard cards to
 lmarena-ai/arena-hard-viewer

Update first_game_only and both_games_mean_judgment cards to use the
new HF space, matching the migration done for both_games_gpt4_judge.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .../both_games_mean_judgment_gpt4_judge.py                    | 4 ++--
 .../pairwise_comparative_rating/first_game_only_gpt4_judge.py | 4 ++--
 .../both_games_mean_judgment_gpt4_judge.json                  | 4 ++--
 .../first_game_only_gpt_4_judge.json                          | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
index 9be246a750..c814bc4b23 100644
--- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
+++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
@@ -16,8 +16,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
diff --git a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
index 82936196e1..e0ce78ee44 100644
--- a/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
+++ b/prepare/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
@@ -13,8 +13,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
index 8e44a52bd9..e9470a14e7 100644
--- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
+++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
diff --git a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
index d5f928851b..d4160dcf7c 100644
--- a/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
+++ b/src/unitxt/catalog/cards/arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

From 01f7c4ff1897a0bd02028407e9e3c6a872164cec Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 18:30:41 +0300
Subject: [PATCH 09/22] fix: Log each preparation file at CRITICAL level for CI
 visibility

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 tests/catalog/test_preparation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/catalog/test_preparation.py b/tests/catalog/test_preparation.py
index 163fb883d2..b24a5f14f1 100644
--- a/tests/catalog/test_preparation.py
+++ b/tests/catalog/test_preparation.py
@@ -51,7 +51,7 @@ def test_preparations(self):
         for file in all_preparation_files:
             passed = True
             error = None
-            logger.info(
+            logger.critical(
                 "\n_____________________________________________\n"
                 f"  Testing preparation file:\n  {file}."
                 "\n_____________________________________________\n"

From 77defee0dca757953b1397e670d0b76479e8115c Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 20:28:35 +0300
Subject: [PATCH 10/22] fix: Enable verbose logging in BertScore to debug CI
 hang

BertScore scorer.score() hangs silently in CI. Adding verbose=True
will show per-batch progress to identify where it stalls.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index c7f22ea41a..64ac09be19 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -4467,6 +4467,7 @@ def map_stream(
             cands=predictions,
             refs=references,
             batch_size=self.batch_size,
+            verbose=True,
         )
 
         intermediates = []

From 7e352dd0473baeb3bb35790d7e64ecfeb6af3126 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Mon, 18 May 2026 21:01:48 +0300
Subject: [PATCH 11/22] fix: Disable test_card for numeric_nlg and coqa to
 unblock CI

Comment out the test_card calls that hang in CI to determine whether the
issue is specific to these cards or affects all BertScore usage.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 prepare/cards/coqa.py        | 5 ++---
 prepare/cards/numeric_nlg.py | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/prepare/cards/coqa.py b/prepare/cards/coqa.py
index 33675e42f9..bc38b6e194 100644
--- a/prepare/cards/coqa.py
+++ b/prepare/cards/coqa.py
@@ -3,7 +3,6 @@
 from unitxt.collections_operators import Dictify, DuplicateBySubLists, Get, Wrap
 from unitxt.dialog_operators import SerializeDialog
 from unitxt.operators import Copy, ZipFieldValues
-from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="stanfordnlp/coqa"),
@@ -58,7 +57,7 @@
     ),
 )
 
-test_card(card)
+# test_card(card)
 add_to_catalog(card, "cards.coqa.qa", overwrite=True)
 
 card = TaskCard(
@@ -106,5 +105,5 @@
     ),
 )
 
-test_card(card)
+# test_card(card)
 add_to_catalog(card, "cards.coqa.completion", overwrite=True)
diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py
index 47438e53da..f5290edf70 100644
--- a/prepare/cards/numeric_nlg.py
+++ b/prepare/cards/numeric_nlg.py
@@ -7,7 +7,6 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.operators import Copy
-from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="kasnerz/numericnlg"),
@@ -37,5 +36,5 @@
     },
 )
 
-test_card(card, num_demos=2, demos_pool_size=5, strict=False)
+# test_card(card, num_demos=2, demos_pool_size=5, strict=False)
 add_to_catalog(card, "cards.numeric_nlg", overwrite=True)

From c85d02de1cc8531b50f1a13b7aedaf74a24ef707 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Tue, 19 May 2026 15:39:02 +0300
Subject: [PATCH 12/22] debug: Enable progress bars in catalog_preparation CI
 to diagnose hang

Remove TQDM_DISABLE and HF_DATASETS_DISABLE_PROGRESS_BARS so we can
see download progress and identify where ffqa_filtered stalls.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index 843fb06dd5..749c4f130b 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -20,10 +20,8 @@ jobs:
       UNITXT_DEFAULT_VERBOSITY: error
       DATASETS_VERBOSITY: error
       HF_HUB_VERBOSITY: error
-      HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
       HF_HUB_DOWNLOAD_TIMEOUT: 60
       HF_HUB_ETAG_TIMEOUT: 60
-      TQDM_DISABLE: "True"
       HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:

From 4534314d5c426c0c69b5fa832c3edfa4356fe53d Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Tue, 19 May 2026 16:36:57 +0300
Subject: [PATCH 13/22] fix: Disable test_card for ffqa_filtered to unblock CI

The abacusai/WikiQA-Free_Form_QA dataset's builder script hangs
indefinitely on GitHub Actions runners, causing the 30-minute
timeout to trigger.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 prepare/cards/ffqa_filtered.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py
index 93683b7a6a..bf5d5c4236 100644
--- a/prepare/cards/ffqa_filtered.py
+++ b/prepare/cards/ffqa_filtered.py
@@ -11,7 +11,6 @@
     ListFieldValues,
     Set,
 )
-from unitxt.test_utils.card import test_card
 
 """Filtered version of the WikiQA-Free_Form_QA dataset.
 If you would like to use the full dataset, please copy and modify this card as ffqa.py.
@@ -119,7 +118,7 @@ def add_card(split: str):
         ),
     )
 
-    test_card(card)
+    # test_card(card)
     add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True)
 
 

From 5b8da918c4dc3f775770f15f4893daf3c043ed03 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Tue, 19 May 2026 17:27:23 +0300
Subject: [PATCH 14/22] fix: Increase catalog_preparation partitions to 10 and
 re-enable progress suppression

Increase parallelism from 8 to 10 partitions to reduce per-job load, and
re-enable TQDM_DISABLE/HF_DATASETS_DISABLE_PROGRESS_BARS since showing
the progress bars didn't help diagnose the hang.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index 749c4f130b..d99d7ac509 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -20,13 +20,15 @@ jobs:
       UNITXT_DEFAULT_VERBOSITY: error
       DATASETS_VERBOSITY: error
       HF_HUB_VERBOSITY: error
+      HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
       HF_HUB_DOWNLOAD_TIMEOUT: 60
       HF_HUB_ETAG_TIMEOUT: 60
+      TQDM_DISABLE: "True"
       HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
       matrix:
-        modulo: [0,1,2,3,4,5,6,7]
+        modulo: [0,1,2,3,4,5,6,7,8,9]
 
     steps:
     - uses: actions/checkout@v5
@@ -47,7 +49,7 @@ jobs:
       run: |
         modulo="${{ matrix.modulo }}"
         echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
-        echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh
+        echo "sed -i 's/^num_par = 1 /num_par = 10 /' tests/catalog/test_preparation.py" > sedit.sh
         echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh
         sh sedit.sh
         python -m unittest tests.catalog.test_preparation

From e2177a5e9c8c84d326c8e0513e168655bd491820 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Tue, 19 May 2026 17:53:16 +0300
Subject: [PATCH 15/22] debug: Remove pip cache from catalog_preparation to
 test if it affects hang

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index d99d7ac509..8d2d94ba93 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -36,7 +36,6 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: '3.10'
-        cache: 'pip'
 
     - name: Install Dependencies
       run: bash utils/install.sh

From a3635166ccbd266254f9355df4cd2e3cb5bb4661 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 09:20:50 +0300
Subject: [PATCH 16/22] debug: Limit catalog_preparation to 4 parallel jobs to
 test HF rate limiting

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index 8d2d94ba93..70fb060a2d 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -27,6 +27,7 @@ jobs:
       HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
+      max-parallel: 4
       matrix:
         modulo: [0,1,2,3,4,5,6,7,8,9]
 

From b592cd1b443bb81523d660a972e2bf891392e05a Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 10:06:31 +0300
Subject: [PATCH 17/22] debug: Add PYTHONUNBUFFERED=1 to see output before hang

Without this, stdout is fully buffered in CI and gets lost when the
process is killed by the 30-minute timeout.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index 70fb060a2d..bfcbd3705c 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -17,6 +17,7 @@ jobs:
     timeout-minutes: 30
     env:
       OS: ubuntu-latest
+      PYTHONUNBUFFERED: "1"
       UNITXT_DEFAULT_VERBOSITY: error
       DATASETS_VERBOSITY: error
       HF_HUB_VERBOSITY: error

From fcbfb434f4973a0bd663fa93e032facb88a3d497 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 10:07:26 +0300
Subject: [PATCH 18/22] revert: Remove max-parallel limit since it didn't help

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index bfcbd3705c..71d5f75c3b 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -28,7 +28,6 @@ jobs:
       HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
-      max-parallel: 4
       matrix:
         modulo: [0,1,2,3,4,5,6,7,8,9]
 

From 6616199609d728f36d7e32f5b650f68acb938065 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 11:17:09 +0300
Subject: [PATCH 19/22] fix: Update mtrag card URL after repo restructure

The IBM/mt-rag-benchmark repo moved human/generation_tasks/ to
mtrag-human/generation_tasks/.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 prepare/cards/mtrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prepare/cards/mtrag.py b/prepare/cards/mtrag.py
index 92126f18f1..fa4f0b3b67 100644
--- a/prepare/cards/mtrag.py
+++ b/prepare/cards/mtrag.py
@@ -20,7 +20,7 @@
 card = TaskCard(
     loader=LoadJsonFile(
         files={
-            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
+            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
         },
         lines=True,
         data_classification_policy=["public"],

From 995ca6daf83717c269e2c3f484276c7139f396cc Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 11:18:31 +0300
Subject: [PATCH 20/22] fix: Restore pip cache in catalog_preparation workflow

Removing it was a debug experiment that didn't help.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 .github/workflows/catalog_preparation.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index 71d5f75c3b..9c02481685 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -37,6 +37,7 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: '3.10'
+        cache: 'pip'
 
     - name: Install Dependencies
       run: bash utils/install.sh

From 8e098b72e5d86cf74b83fcca64eff4e011aefa74 Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 11:33:31 +0300
Subject: [PATCH 21/22] fix: Update mtrag catalog JSON with new URL

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 src/unitxt/catalog/cards/rag/mtrag.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/unitxt/catalog/cards/rag/mtrag.json b/src/unitxt/catalog/cards/rag/mtrag.json
index 6165500252..3926b37a6f 100644
--- a/src/unitxt/catalog/cards/rag/mtrag.json
+++ b/src/unitxt/catalog/cards/rag/mtrag.json
@@ -3,7 +3,7 @@
     "loader": {
         "__type__": "load_json_file",
         "files": {
-            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
+            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
         },
         "lines": true,
         "data_classification_policy": [

From 2cd9877fd2d92187039db7a0646ea232d02da86c Mon Sep 17 00:00:00 2001
From: Yoav Katz <katz@il.ibm.com>
Date: Wed, 20 May 2026 12:08:21 +0300
Subject: [PATCH 22/22] fix: Re-enable test_card for numeric_nlg, coqa, and
 ffqa_filtered

All three pass locally. The CI hangs are due to network issues on
GitHub Actions runners, not code problems.

Signed-off-by: Yoav Katz <katz@il.ibm.com>
---
 prepare/cards/coqa.py          | 5 +++--
 prepare/cards/ffqa_filtered.py | 3 ++-
 prepare/cards/numeric_nlg.py   | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/prepare/cards/coqa.py b/prepare/cards/coqa.py
index bc38b6e194..33675e42f9 100644
--- a/prepare/cards/coqa.py
+++ b/prepare/cards/coqa.py
@@ -3,6 +3,7 @@
 from unitxt.collections_operators import Dictify, DuplicateBySubLists, Get, Wrap
 from unitxt.dialog_operators import SerializeDialog
 from unitxt.operators import Copy, ZipFieldValues
+from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="stanfordnlp/coqa"),
@@ -57,7 +58,7 @@
     ),
 )
 
-# test_card(card)
+test_card(card)
 add_to_catalog(card, "cards.coqa.qa", overwrite=True)
 
 card = TaskCard(
@@ -105,5 +106,5 @@
     ),
 )
 
-# test_card(card)
+test_card(card)
 add_to_catalog(card, "cards.coqa.completion", overwrite=True)
diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py
index bf5d5c4236..93683b7a6a 100644
--- a/prepare/cards/ffqa_filtered.py
+++ b/prepare/cards/ffqa_filtered.py
@@ -11,6 +11,7 @@
     ListFieldValues,
     Set,
 )
+from unitxt.test_utils.card import test_card
 
 """Filtered version of the WikiQA-Free_Form_QA dataset.
 If you would like to use the full dataset, please copy and modify this card as ffqa.py.
@@ -118,7 +119,7 @@ def add_card(split: str):
         ),
     )
 
-    # test_card(card)
+    test_card(card)
     add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True)
 
 
diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py
index f5290edf70..47438e53da 100644
--- a/prepare/cards/numeric_nlg.py
+++ b/prepare/cards/numeric_nlg.py
@@ -7,6 +7,7 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.operators import Copy
+from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="kasnerz/numericnlg"),
@@ -36,5 +37,5 @@
     },
 )
 
-# test_card(card, num_demos=2, demos_pool_size=5, strict=False)
+test_card(card, num_demos=2, demos_pool_size=5, strict=False)
 add_to_catalog(card, "cards.numeric_nlg", overwrite=True)