diff --git a/benchmark_tasks/new_reason/new_reason.yaml b/benchmark_tasks/new_reason/new_reason.yaml
new file mode 100644
index 0000000..380698e
--- /dev/null
+++ b/benchmark_tasks/new_reason/new_reason.yaml
@@ -0,0 +1,35 @@
+dataset_path: MERA-evaluation/new_reason
+task: new_reason
+
+test_split: test
+fewshot_split: test
+
+output_type: generate_until
+
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "{{ outputs.strip() }}"
+
+
+fewshot_config:
+  sampler: default
+
+generation_kwargs:
+  do_sample: false
+
+filter_list:
+  - name: "scoring"
+    filter:
+      - function: remove_whitespace_and_nones
+      - function: "take_first"
+
+process_results: !function utils.process_results
+
+metric_list:
+  - metric: em
+    aggregation: mean
+    higher_is_better: true
+
+num_fewshot: null
+
+metadata:
+  version: 1.0
\ No newline at end of file
diff --git a/benchmark_tasks/new_reason/utils.py b/benchmark_tasks/new_reason/utils.py
new file mode 100644
index 0000000..6fee0ab
--- /dev/null
+++ b/benchmark_tasks/new_reason/utils.py
@@ -0,0 +1,44 @@
+"""Prompt building, scoring and response filtering for the new_reason task."""
+
+from typing import Any, Dict, List, Optional
+
+from lm_eval.api.filter import Filter
+from lm_eval.api.registry import register_filter
+from transformers.data.metrics import squad_metrics
+
+
+def doc_to_text(doc: Dict[str, Any]) -> str:
+    """Build the prompt by filling the instruction template with the inputs.
+
+    ``doc["instruction"]`` is used as a ``str.format`` template and
+    ``doc["inputs"]`` supplies its named fields.
+    """
+    return doc["instruction"].format(**doc["inputs"])
+
+
+def process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, int]:
+    """Score the first response against the gold answer with SQuAD exact match.
+
+    Returns ``{"em": 0}`` when the doc has no gold answer or there is no
+    response to score; otherwise ``em`` is the ``compute_exact`` result.
+    """
+    gold_label = doc["outputs"]
+    # Guard both sides: an empty gold answer or a missing response scores 0.
+    if not gold_label or not results:
+        return {"em": 0}
+    return {"em": squad_metrics.compute_exact(gold_label, results[0])}
+
+
+@register_filter("remove_whitespace_and_nones")
+class RemoveWhitespaceAndNones(Filter):
+    """Lstrip each response; empty or ``None`` responses become empty strings."""
+
+    def apply(
+        self, resps: List[List[Optional[str]]], docs: List[Dict[str, Any]]
+    ) -> List[List[str]]:
+        """Return ``resps`` cleaned element-wise, keeping the nested structure.
+
+        Only leading whitespace is removed (``str.lstrip``). Falsy responses
+        are replaced by an empty string. ``docs`` is not used.
+        """
+        return [[resp.lstrip() if resp else "" for resp in inst] for inst in resps]