Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions benchmark_tasks/new_reason/new_reason.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Task configuration for the `new_reason` benchmark task.
# NOTE(review): presumably consumed by lm-evaluation-harness, since the
# accompanying utils.py imports from `lm_eval` — confirm.
dataset_path: MERA-evaluation/new_reason
task: new_reason

# NOTE(review): the few-shot split is the same split that is evaluated
# (`test`) — confirm that sharing examples between the two is intended.
test_split: test
fewshot_split: test

output_type: generate_until

# Prompt text comes from utils.doc_to_text; the target is the document's
# `outputs` field with leading/trailing whitespace stripped.
doc_to_text: !function utils.doc_to_text
doc_to_target: "{{ outputs.strip() }}"


fewshot_config:
sampler: default

# Sampling is switched off for generation.
generation_kwargs:
do_sample: false

# "scoring" filter pipeline: first the custom `remove_whitespace_and_nones`
# filter registered in utils.py, then `take_first`
# (presumably keeps only the first response per document — verify).
filter_list:
- name: "scoring"
filter:
- function: remove_whitespace_and_nones
- function: "take_first"

# Per-document scoring is delegated to utils.process_results, which returns
# a dict keyed by "em".
process_results: !function utils.process_results

# The per-document `em` values are averaged; larger is better.
metric_list:
- metric: em
aggregation: mean
higher_is_better: true

# No few-shot count is fixed in this file.
num_fewshot: null

metadata:
version: 1.0
39 changes: 39 additions & 0 deletions benchmark_tasks/new_reason/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Dict, List, Any
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
from transformers.data.metrics import squad_metrics


def doc_to_text(doc: Dict[str, Any]) -> str:
    """Build the prompt string for a single document.

    The value under ``"instruction"`` is used as a ``str.format`` template
    and is filled with the key/value pairs stored under ``"inputs"``.

    Args:
        doc: Document holding an ``"instruction"`` template and an
            ``"inputs"`` mapping whose keys match the template placeholders.

    Returns:
        The instruction with every placeholder substituted.
    """
    # Bind the formatter first, then expand the inputs as keyword arguments.
    render = doc["instruction"].format
    return render(**doc["inputs"])


def process_results(doc: Dict, results: List[str]) -> Dict:
    """Compute the ``em`` score for one document.

    Args:
        doc: Document whose ``"outputs"`` field holds the gold answer.
        results: Model responses for this document; only the first one
            is scored.

    Returns:
        ``{"em": score}``, where ``score`` is whatever
        ``squad_metrics.compute_exact`` returns for the gold answer and the
        first response, or ``0`` when the gold answer is empty.
    """
    reference = doc["outputs"]
    if len(reference) == 0:
        # Nothing to compare against: report a miss.
        return {"em": 0}

    prediction = results[0]
    return {"em": squad_metrics.compute_exact(reference, prediction)}


@register_filter("remove_whitespace_and_nones")
class RemoveWhitespaceAndNones(Filter):
    """Filter that left-strips responses and replaces falsy ones with ``""``."""

    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
        """Normalise every response in every per-document group.

        Args:
            resps: One list of responses per document.
            docs: The corresponding documents; not used by this filter.

        Returns:
            A new nested list with the same structure as ``resps``: falsy
            entries (e.g. ``None`` or ``""``) become ``""`` and all other
            entries have their leading whitespace removed.
        """
        return [
            [answer.lstrip() if answer else "" for answer in group]
            for group in resps
        ]