Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[submodule "lm-evaluation-harness"]
path = lm-evaluation-harness
url = https://github.com/MERA-Evaluation/lm-evaluation-harness.git
branch = mera_text
url = https://github.com/artemorloff/lm-evaluation-harness.git
branch = rutie_text
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ etc.

**🚀 How to Contribute?**

0. Develop your dataset according to the text LLM evaluation criteria ([see requirements](docs/dataset_review.md)).
1. Format your dataset to our specifications ([format instruction](docs/dataset_formatting.md)) and upload it to the 🤗 Hugging Face Hub ([instruction](docs/dataset_hf.md)).
2. Integrate your dataset into our codebase using the instructions above. Check that it works by running the baselines! ([instruction](docs/task_codebase.md)).
3. Submit a **Pull Request** with your dataset to this repository.
0. Submit a **Pull Request** with the dataset description to this repository ([instruction](docs/how_to_add_dataset.md)).
1. Develop your dataset according to the text LLM evaluation criteria ([see requirements](docs/dataset_review.md)).
2. Format your dataset to our specifications ([format instruction](docs/dataset_formatting.md)) and upload it to the 🤗 Hugging Face Hub ([instruction](docs/dataset_hf.md)).
3. Integrate your dataset into our codebase using the instructions above. Check that it works by running the baselines! ([instruction](docs/task_codebase.md)).

We will review your submission and, upon approval, add it to New MERA TEXT.

Expand Down
4 changes: 4 additions & 0 deletions benchmark_tasks/custom_openjudge_localscore_task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: ./custom_openjudge_task.yaml
tag:
- mera_openjudge
- mera_openjudge_local
3 changes: 3 additions & 0 deletions benchmark_tasks/custom_openjudge_task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include: ./custom_generate_task.yaml
tag:
- mera_openjudge
2 changes: 1 addition & 1 deletion benchmark_tasks/custom_samplers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import warnings
from typing import Optional

eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)


class FewshotSampler(ContextSampler):
Expand Down
85 changes: 85 additions & 0 deletions benchmark_tasks/openjudge_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
from pathlib import Path

try:
    from lm_eval.api.filter import Filter
    from lm_eval.api.registry import FILTER_REGISTRY, register_filter
except ModuleNotFoundError:
    # lm_eval could not be imported: define minimal stand-ins with the same
    # names so the rest of this module can still be imported and used.
    FILTER_REGISTRY = {}

    class Filter:  # type: ignore[no-redef]
        """Placeholder base class used when lm_eval is not importable."""

    def register_filter(filter_name):  # type: ignore[no-redef]
        """Return a class decorator that stores the class under ``filter_name``.

        The decorated class is written to ``FILTER_REGISTRY[filter_name]``
        and returned unchanged.
        """

        def _register(cls):
            FILTER_REGISTRY[filter_name] = cls
            return cls

        return _register

from mera_openjudge import OpenJudgeScorer, load_judge_config


def _read_numeric_env(name, default, cast):
    """Read environment variable ``name`` and convert it with ``cast``.

    Args:
        name: Environment variable to read.
        default: Value returned as-is when the variable is unset or blank.
        cast: Converter applied to the raw string (``int`` or ``float``).

    Returns:
        The converted value, or ``default``.

    Raises:
        ValueError: If the variable holds text that ``cast`` rejects. The
            message names the offending variable.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        # A variable exported as an empty string (common in CI and compose
        # files) means "not configured", not "invalid number".
        return default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a valid {cast.__name__}, got {raw!r}"
        ) from err


def _load_runtime_config_from_env():
    """Build the judge runtime configuration from ``MERA_JUDGE_*`` variables.

    Unset or blank variables fall back to the defaults below. The backend is
    always ``"openai_compatible"``.

    Returns:
        A dict with the keys ``api_key``, ``backend``, ``base_url``,
        ``max_new_tokens``, ``model_name``, ``temperature`` and ``timeout``.

    Raises:
        ValueError: If a numeric variable cannot be parsed.
    """
    return {
        "api_key": os.environ.get("MERA_JUDGE_API_KEY", ""),
        "backend": "openai_compatible",
        "base_url": os.environ.get("MERA_JUDGE_BASE_URL", ""),
        "max_new_tokens": _read_numeric_env("MERA_JUDGE_MAX_NEW_TOKENS", 128, int),
        # An empty model name is never usable, so blank falls back as well.
        "model_name": os.environ.get("MERA_JUDGE_MODEL") or "ai-forever/pollux-judge-7b",
        "temperature": _read_numeric_env("MERA_JUDGE_TEMPERATURE", 0.1, float),
        "timeout": _read_numeric_env("MERA_JUDGE_TIMEOUT", 120, int),
    }


def _judge_config_path(task_dir):
    """Return the path of ``judge.yaml`` inside ``task_dir`` as a string."""
    config_file = Path(task_dir).joinpath("judge.yaml")
    return str(config_file)


def build_process_results(task_dir):
    """Create the ``process_results`` callback for the task in ``task_dir``.

    The judge configuration at ``<task_dir>/judge.yaml`` is loaded once, when
    this factory is called. Its ``metric_name`` and one ``judge_<key>`` entry
    per criterion define the all-zero metrics returned for samples without
    scores.

    Args:
        task_dir: Directory that contains the task's ``judge.yaml``.

    Returns:
        A function ``process_results(doc, results)`` returning a metrics dict.
    """
    config = load_judge_config(_judge_config_path(task_dir))
    metric_names = [config.metric_name]
    metric_names.extend(f"judge_{criterion.key}" for criterion in config.criteria)
    fallback_scores = dict.fromkeys(metric_names, 0.0)

    def process_results(doc, results):
        """Return a copy of ``results[0]``, or zeros when it is missing or empty."""
        del doc  # the document is not needed here
        if results and results[0]:
            return dict(results[0])
        # Hand out a fresh copy so callers cannot mutate the shared defaults.
        return dict(fallback_scores)

    return process_results


def register_openjudge_filter(filter_name, task_dir):
    """Register a judge-scoring filter under ``filter_name`` and return the name.

    If ``FILTER_REGISTRY`` already holds a truthy entry for ``filter_name``,
    nothing is registered and ``task_dir`` is ignored. Otherwise a ``Filter``
    subclass bound to ``<task_dir>/judge.yaml`` is registered.

    Args:
        filter_name: Registry key for the filter.
        task_dir: Directory that contains the task's ``judge.yaml``.

    Returns:
        ``filter_name``, unchanged.
    """
    if not FILTER_REGISTRY.get(filter_name, None):
        config_path = _judge_config_path(task_dir)

        @register_filter(filter_name)
        class OpenJudgeScoring(Filter):
            """Filter that scores the first response of each sample with OpenJudgeScorer."""

            def __init__(self) -> None:
                self.judge_config_path = config_path
                self.runtime_config = _load_runtime_config_from_env()
                # The scorer is created on first use, not at construction time.
                self._scorer = None

            def _get_scorer(self):
                """Return the scorer, constructing it on the first call."""
                if self._scorer is not None:
                    return self._scorer
                self._scorer = OpenJudgeScorer(
                    judge_config=self.judge_config_path,
                    runtime_config=self.runtime_config,
                )
                return self._scorer

            def apply(self, resps, docs):
                """Score one answer per sample and wrap each metrics object in a list.

                Only the first response of a sample is used (stringified and
                stripped); a sample without responses yields the empty string.
                """
                answers = [str(sample[0]).strip() if sample else "" for sample in resps]
                scored = self._get_scorer().score_answers(docs, answers)
                return [[metrics] for metrics in scored]

    return filter_name
21 changes: 21 additions & 0 deletions benchmark_tasks/pollux_instructions_example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## POLLUX Instructions Example

This directory contains a minimal open-generation example built from the
`ai-forever/POLLUX-instructions` dataset.

Source prompt:
- dataset: `ai-forever/POLLUX-instructions`
- split: `train`
- `prompt_id`: `0`
- instruction:
`Составь мне план научного доклада об измерении содержания метана в испарениях над морем Лаптевых.`

Source criterion:
- dataset: `ai-forever/POLLUX-criteria`
- task subtype: `Составить план текста`
- domain: `Научный`
- criterion name: `Глубина проработки ответа`

The example keeps only one criterion on purpose so that the task stays short and
readable in the repository. Real tasks should usually copy the full criteria set
from `POLLUX-instructions`.
9 changes: 9 additions & 0 deletions benchmark_tasks/pollux_instructions_example/judge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
criteria:
- key: depth
name: Глубина проработки ответа
scale:
0: Модель выполнила запрос, но ответ поверхностный, неглубокий, с очень слабой степенью проработки.
1: Модель демонстрирует приемлемую степень проработки, но ответу не хватает глубины и/или детализации.
2: Модель выдает отличный, проработанный ответ. Ответ демонстрирует хорошее раскрытие темы и достаточную детализацию.
metric_name: judge_avg
reference_field: meta.reference_answer
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
include: ../custom_openjudge_task.yaml
tag:
- mera_openjudge_example
task: pollux_instructions_example
dataset_path: ai-forever/POLLUX-instructions
dataset_name: default
test_split: train
process_docs: !function utils.process_docs
doc_to_text: "{{instruction}}"
generation_kwargs:
do_sample: false
process_results: !function utils.process_results
filter_list:
- name: "scoring"
filter:
- function: polluxinstructionsexamplescoring
metric_list:
- metric: judge_avg
aggregation: mean
higher_is_better: true
- metric: judge_depth
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include: pollux_instructions_example.yaml
tag:
- mera_openjudge_example_local
task: pollux_instructions_example_localscore
test_split: train
38 changes: 38 additions & 0 deletions benchmark_tasks/pollux_instructions_example/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from pathlib import Path

import datasets

from benchmark_tasks.openjudge_utils import build_process_results, register_openjudge_filter


# Directory that contains this module; passed on as the task directory.
TASK_DIR = Path(__file__).resolve().parent
# Registration happens at import time, under the filter name that the task
# YAML lists in its `filter_list`.
FILTER_NAME = register_openjudge_filter("polluxinstructionsexamplescoring", TASK_DIR)
# `prompt_id` values kept by process_docs; rows with other ids are filtered out.
EXAMPLE_PROMPT_IDS = {0}


def _process_doc(doc):
    """Convert one source row into the instruction/inputs/outputs/meta layout.

    ``instruction`` and ``prompt_id`` are required keys. A missing or falsy
    ``reference_answer`` becomes the empty string, and the optional
    descriptive fields default to the empty string.
    """
    instruction = doc["instruction"]
    reference = doc.get("reference_answer", "") or ""
    prompt_id = int(doc["prompt_id"])

    meta = {
        "id": prompt_id,
        "reference_answer": reference,
        "source_dataset": "ai-forever/POLLUX-instructions",
        "source_prompt_id": prompt_id,
    }
    # Descriptive fields are copied through unchanged when present.
    for field in ("task_type", "task_subtype", "task_subsubtype", "difficulty", "domain"):
        meta[field] = doc.get(field, "")

    return {
        "instruction": instruction,
        "inputs": "",
        "outputs": reference,
        "meta": meta,
    }


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Keep the rows whose ``prompt_id`` is in ``EXAMPLE_PROMPT_IDS`` and reformat them."""

    def _is_example(doc):
        return doc["prompt_id"] in EXAMPLE_PROMPT_IDS

    return dataset.filter(_is_example).map(_process_doc)


# Built once at import time from this task's judge.yaml; the task YAML refers
# to it as `!function utils.process_results`.
process_results = build_process_results(TASK_DIR)

44 changes: 23 additions & 21 deletions benchmark_tasks/rucodeeval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
from lm_eval.api.registry import FILTER_REGISTRY


def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
Expand All @@ -40,27 +41,28 @@ def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
} # if no label provided (test answers are secret)


@register_filter("ruhumanevalscoring")
class ruHumanEvalScoring(Filter):
    """Filter that runs every generated completion through ``execute_function``."""

    def __init__(self) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
        """

    def apply(self, resps, docs):
        """
        Preprocess and execute every completion of every sample.

        `resps` is a list of lists of generations. The result has the same
        nesting: one inner list per sample holding the `execute_function`
        result of each completion, evaluated against `docs[idx]`.
        """
        return [
            [
                execute_function(preprocess_generation(completion), docs[idx])
                for completion in sample
            ]
            for idx, sample in enumerate(resps)
        ]
# NOTE(review): another task module in this repository guards the same filter
# name, so whichever module is imported first supplies the registered class.
# Confirm the implementations are interchangeable.
if not FILTER_REGISTRY.get("ruhumanevalscoring", None):

    @register_filter("ruhumanevalscoring")
    class ruHumanEvalScoring(Filter):
        """Filter that runs every generated completion through ``execute_function``."""

        def __init__(self) -> None:
            """
            Can define custom behavior here, if an individual instantiation of a Filter class should have state.
            """

        def apply(self, resps, docs):
            """
            Preprocess and execute every completion of every sample.

            `resps` is a list of lists of generations. The result has the same
            nesting: one inner list per sample holding the `execute_function`
            result of each completion, evaluated against `docs[idx]`.
            """
            return [
                [
                    execute_function(preprocess_generation(completion), docs[idx])
                    for completion in sample
                ]
                for idx, sample in enumerate(resps)
            ]


def preprocess_generation(generation):
Expand Down
44 changes: 23 additions & 21 deletions benchmark_tasks/ruhumaneval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
from lm_eval.api.registry import FILTER_REGISTRY


def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
Expand All @@ -40,27 +41,28 @@ def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
} # if no label provided (test answers are secret)


@register_filter("ruhumanevalscoring")
class ruHumanEvalScoring(Filter):
    """Filter that runs every generated completion through ``execute_function``."""

    def __init__(self) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
        """

    def apply(self, resps, docs):
        """
        Preprocess and execute every completion of every sample.

        `resps` is a list of lists of generations. The result has the same
        nesting: one inner list per sample holding the `execute_function`
        result of each completion, evaluated against `docs[idx]`.
        """
        return [
            [
                execute_function(preprocess_generation(completion), docs[idx])
                for completion in sample
            ]
            for idx, sample in enumerate(resps)
        ]
# NOTE(review): another task module in this repository guards the same filter
# name, so whichever module is imported first supplies the registered class.
# Confirm the implementations are interchangeable.
if not FILTER_REGISTRY.get("ruhumanevalscoring", None):

    @register_filter("ruhumanevalscoring")
    class ruHumanEvalScoring(Filter):
        """Filter that runs every generated completion through ``execute_function``."""

        def __init__(self) -> None:
            """
            Can define custom behavior here, if an individual instantiation of a Filter class should have state.
            """

        def apply(self, resps, docs):
            """
            Preprocess and execute every completion of every sample.

            `resps` is a list of lists of generations. The result has the same
            nesting: one inner list per sample holding the `execute_function`
            result of each completion, evaluated against `docs[idx]`.
            """
            return [
                [
                    execute_function(preprocess_generation(completion), docs[idx])
                    for completion in sample
                ]
                for idx, sample in enumerate(resps)
            ]


def preprocess_generation(generation):
Expand Down
4 changes: 2 additions & 2 deletions benchmark_tasks/rutie/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def _update_request(storage, request):

# when string passed (everywhere except for API calls)
if isinstance(request.arguments[0], str):
new_req = replace_targets(request.arguments[0], max_num, storage)
new_req = replace_targets(request.arguments[0], max_num, storage).replace("{context}", "")
request.arguments = (new_req, request.arguments[1])
else:
new_req = replace_targets(request.arguments[0].prompt, max_num, storage)
new_req = replace_targets(request.arguments[0].prompt, max_num, storage).replace("{context}", "")
new_req = JsonChatStr(new_req)
request.arguments = (new_req, request.arguments[1])

Expand Down
Loading