MERA-Evaluation · Alex-ast7 · Mar 10, 2026
diff --git a/benchmark_tasks/new_reason/new_reason.yaml b/benchmark_tasks/new_reason/new_reason.yaml
@@ -0,0 +1,35 @@
+dataset_path: MERA-evaluation/new_reason  # dataset identifier (presumably a Hugging Face Hub repo - confirm)
+task: new_reason
+
+test_split: test
+fewshot_split: test  # few-shot examples are drawn from the same split that is evaluated
+
+output_type: generate_until  # free-form generation task
+
+doc_to_text: !function utils.doc_to_text  # prompt = doc["instruction"] formatted with doc["inputs"]
+doc_to_target: "{{ outputs.strip() }}"  # reference answer with surrounding whitespace stripped
+
+
+fewshot_config:
+  sampler: default
+
+generation_kwargs:
+  do_sample: false  # sampling disabled
+
+filter_list:
+  - name: "scoring"
+    filter:
+      - function: remove_whitespace_and_nones  # custom filter registered in utils.py
+      - function: "take_first"
+
+process_results: !function utils.process_results  # returns a dict with the single key "em"
+
+metric_list:
+  - metric: em  # key of the dict returned by utils.process_results
+    aggregation: mean
+    higher_is_better: true
+
+num_fewshot: null  # no shot count fixed here; NOTE(review): presumably supplied at run time - confirm
+
+metadata:
+  version: 1.0
diff --git a/benchmark_tasks/new_reason/utils.py b/benchmark_tasks/new_reason/utils.py
@@ -0,0 +1,39 @@
+from typing import Dict, List, Any
+from lm_eval.api.filter import Filter
+from lm_eval.api.registry import register_filter
+from transformers.data.metrics import squad_metrics
+
+
+def doc_to_text(doc: Dict[str, Any]) -> str:
+    template, fields = doc["instruction"], doc["inputs"]  # prompt template and its placeholder values
+    return template.format(**fields)
+
+
+def process_results(doc: Dict, results: List[str]) -> Dict:
+    """Score the first response against ``doc["outputs"]`` via ``squad_metrics.compute_exact``.
+
+    Returns ``{"em": 0}`` without scoring when ``doc["outputs"]`` is empty.
+    """
+    reference = doc["outputs"]
+    if len(reference) == 0:
+        return {"em": 0}
+    return {"em": squad_metrics.compute_exact(reference, results[0])}
+
+
+@register_filter("remove_whitespace_and_nones")
+class RemoveWhitespaceAndNones(Filter):
+    """Filter that left-strips every response and turns falsy ones (``None``, ``""``) into ``""``."""
+
+    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+        """Return a cleaned copy of ``resps``; the input lists are not mutated.
+
+        Args:
+            resps: A list of response lists.
+            docs: Accepted as part of the signature; not read here.
+
+        Returns:
+            Lists of the same shape as ``resps``, with each response left-stripped
+            and every falsy response replaced by an empty string.
+        """
+        # ``lstrip`` only: trailing whitespace is left untouched.
+        return [[text.lstrip() if text else "" for text in group] for group in resps]