MERA-Evaluation · danil31219as · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "lm-evaluation-harness"]
 	path = lm-evaluation-harness
-	url = https://github.com/MERA-Evaluation/lm-evaluation-harness.git
-	branch = mera_text
+	url = https://github.com/artemorloff/lm-evaluation-harness.git
+	branch = rutie_text
diff --git a/README.md b/README.md
@@ -44,10 +44,10 @@ etc.
 
 **🚀 How to Contribute?**
 
-0. Develop your dataset according to the text LLM evaluation criteria ([see requirements](docs/dataset_review.md)).
-1. Format your dataset to our specifications ([format instruction](docs/dataset_formatting.md)) and upload it to the 🤗 Hugging Face Hub ([instruction](docs/dataset_hf.md](docs/dataset_formatting.md))). 
-2.  Integrate your dataset into our codebase using the instructions above. Check that it works by running the baselines! ([instruction](docs/task_codebase.md)).
-3. Submit a **Pull Request** with your dataset to this repository.
+0. Submit a **Pull Request** with the dataset description to this repository ([instruction](docs/how_to_add_dataset.md)).
+1. Develop your dataset according to the text LLM evaluation criteria ([see requirements](docs/dataset_review.md)).
+2. Format your dataset to our specifications ([format instruction](docs/dataset_formatting.md)) and upload it to the 🤗 Hugging Face Hub ([instruction](docs/dataset_hf.md)).
+3. Integrate your dataset into our codebase using the instructions above. Check that it works by running the baselines! ([instruction](docs/task_codebase.md)).
 
 We will review your submission and, upon approval, add it to New MERA TEXT.
 

diff --git a/benchmark_tasks/custom_openjudge_localscore_task.yaml b/benchmark_tasks/custom_openjudge_localscore_task.yaml
@@ -0,0 +1,4 @@
+include: ./custom_openjudge_task.yaml
+tag:
+  - mera_openjudge
+  - mera_openjudge_local
diff --git a/benchmark_tasks/custom_openjudge_task.yaml b/benchmark_tasks/custom_openjudge_task.yaml
@@ -0,0 +1,3 @@
+include: ./custom_generate_task.yaml
+tag:
+  - mera_openjudge
diff --git a/benchmark_tasks/custom_samplers.py b/benchmark_tasks/custom_samplers.py
@@ -4,7 +4,7 @@
 import warnings
 from typing import Optional
 
-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)
 
 
 class FewshotSampler(ContextSampler):

diff --git a/benchmark_tasks/openjudge_utils.py b/benchmark_tasks/openjudge_utils.py
@@ -0,0 +1,85 @@
+import os
+from pathlib import Path
+
+try:
+    from lm_eval.api.filter import Filter
+    from lm_eval.api.registry import FILTER_REGISTRY, register_filter
+except ModuleNotFoundError:  # import failed: define minimal local stand-ins with the same three names
+    FILTER_REGISTRY = {}  # stand-in registry: filter name -> registered class
+
+    class Filter:  # type: ignore[no-redef]
+        pass
+
+    def register_filter(filter_name):  # type: ignore[no-redef]
+        def decorator(cls):
+            FILTER_REGISTRY[filter_name] = cls  # store the class under filter_name, overwriting any previous entry
+            return cls  # the decorated class is returned unchanged
+
+        return decorator
+
+from mera_openjudge import OpenJudgeScorer, load_judge_config
+
+
+def _load_runtime_config_from_env():  # build the judge runtime settings dict from MERA_JUDGE_* environment variables
+    return {
+        "api_key": os.environ.get("MERA_JUDGE_API_KEY", ""),  # empty string when the variable is unset
+        "backend": "openai_compatible",  # hard-coded; not configurable through the environment
+        "base_url": os.environ.get("MERA_JUDGE_BASE_URL", ""),  # empty string when the variable is unset
+        "max_new_tokens": int(os.environ.get("MERA_JUDGE_MAX_NEW_TOKENS", "128")),  # int() raises ValueError on a non-integer value
+        "model_name": os.environ.get("MERA_JUDGE_MODEL", "ai-forever/pollux-judge-7b"),
+        "temperature": float(os.environ.get("MERA_JUDGE_TEMPERATURE", "0.1")),  # float() raises ValueError on a non-numeric value
+        "timeout": int(os.environ.get("MERA_JUDGE_TIMEOUT", "120")),  # unit not visible here; presumably seconds - confirm against OpenJudgeScorer
+    }
+
+
+def _judge_config_path(task_dir):  # task_dir may be a str or a Path; the result is always a str
+    return str(Path(task_dir) / "judge.yaml")  # the judge config is expected directly inside task_dir
+
+
+def build_process_results(task_dir):  # returns a process_results(doc, results) closure bound to task_dir's judge.yaml
+    judge_config = load_judge_config(_judge_config_path(task_dir))  # loaded once, when the closure is built
+    zero_metrics = {  # all-zero fallback: the aggregate metric plus one judge_<key> entry per criterion
+        judge_config.metric_name: 0.0,
+        **{f"judge_{criterion.key}": 0.0 for criterion in judge_config.criteria},
+    }
+
+    def process_results(doc, results):
+        del doc  # the document is not used here
+        if not results or not results[0]:  # no results, or a falsy first result
+            return dict(zero_metrics)  # fresh copy so callers cannot mutate the shared fallback
+        return dict(results[0])  # the first result is taken as the metrics mapping and copied
+
+    return process_results
+
+
+def register_openjudge_filter(filter_name, task_dir):  # registers a judge-scoring filter under filter_name and returns that name
+    if FILTER_REGISTRY.get(filter_name, None):  # a truthy entry already exists: keep it and register nothing
+        return filter_name
+
+    judge_config_path = _judge_config_path(task_dir)  # captured by the class defined below
+
+    @register_filter(filter_name)
+    class OpenJudgeScoring(Filter):
+        def __init__(self) -> None:
+            self.judge_config_path = judge_config_path
+            self.runtime_config = _load_runtime_config_from_env()  # environment is read when the filter is instantiated
+            self._scorer = None  # created on first use by _get_scorer
+
+        def _get_scorer(self):
+            if self._scorer is None:  # build the scorer once and reuse it afterwards
+                self._scorer = OpenJudgeScorer(
+                    judge_config=self.judge_config_path,
+                    runtime_config=self.runtime_config,
+                )
+            return self._scorer
+
+        def apply(self, resps, docs):
+            answers = []  # one answer string per entry of resps
+            for sample in resps:
+                answer = ""  # used when the sample is empty
+                if sample:
+                    answer = str(sample[0]).strip()  # only the first response of a sample is judged
+                answers.append(answer)
+            return [[metrics] for metrics in self._get_scorer().score_answers(docs, answers)]  # each scorer result wrapped in a one-element list
+
+    return filter_name
diff --git a/benchmark_tasks/pollux_instructions_example/README.md b/benchmark_tasks/pollux_instructions_example/README.md
@@ -0,0 +1,21 @@
+## POLLUX Instructions Example
+
+This directory contains a minimal open-generation example built from the
+`ai-forever/POLLUX-instructions` dataset.
+
+Source prompt:
+- dataset: `ai-forever/POLLUX-instructions`
+- split: `train`
+- `prompt_id`: `0`
+- instruction:
+  `Составь мне план научного доклада об измерении содержания метана в испарениях над морем Лаптевых.`
+
+Source criterion:
+- dataset: `ai-forever/POLLUX-criteria`
+- task subtype: `Составить план текста`
+- domain: `Научный`
+- criterion name: `Глубина проработки ответа`
+
+The example keeps only one criterion on purpose so that the task stays short and
+readable in the repository. Real tasks should usually copy the full criteria set
+from `POLLUX-instructions`.
diff --git a/benchmark_tasks/pollux_instructions_example/judge.yaml b/benchmark_tasks/pollux_instructions_example/judge.yaml
@@ -0,0 +1,9 @@
+criteria:
+  - key: depth
+    name: Глубина проработки ответа
+    scale:
+      0: Модель выполнила запрос, но ответ поверхностный, неглубокий, с очень слабой степенью проработки.
+      1: Модель демонстрирует приемлемую степень проработки, но ответу не хватает глубины и/или детализации.
+      2: Модель выдает отличный, проработанный ответ. Ответ демонстрирует хорошее раскрытие темы и достаточную детализацию.
+metric_name: judge_avg
+reference_field: meta.reference_answer
diff --git a/benchmark_tasks/pollux_instructions_example/pollux_instructions_example.yaml b/benchmark_tasks/pollux_instructions_example/pollux_instructions_example.yaml
@@ -0,0 +1,25 @@
+include: ../custom_openjudge_task.yaml
+tag:
+  - mera_openjudge_example
+task: pollux_instructions_example
+dataset_path: ai-forever/POLLUX-instructions
+dataset_name: default
+test_split: train
+process_docs: !function utils.process_docs
+doc_to_text: "{{instruction}}"
+generation_kwargs:
+  do_sample: false
+process_results: !function utils.process_results
+filter_list:
+  - name: "scoring"
+    filter:
+      - function: polluxinstructionsexamplescoring
+metric_list:
+  - metric: judge_avg
+    aggregation: mean
+    higher_is_better: true
+  - metric: judge_depth
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/benchmark_tasks/pollux_instructions_example/pollux_instructions_example_localscore.yaml b/benchmark_tasks/pollux_instructions_example/pollux_instructions_example_localscore.yaml
@@ -0,0 +1,5 @@
+include: pollux_instructions_example.yaml
+tag:
+  - mera_openjudge_example_local
+task: pollux_instructions_example_localscore
+test_split: train
diff --git a/benchmark_tasks/pollux_instructions_example/utils.py b/benchmark_tasks/pollux_instructions_example/utils.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+import datasets
+
+from benchmark_tasks.openjudge_utils import build_process_results, register_openjudge_filter
+
+
+TASK_DIR = Path(__file__).resolve().parent  # directory containing this utils.py
+FILTER_NAME = register_openjudge_filter("polluxinstructionsexamplescoring", TASK_DIR)  # runs at import time
+EXAMPLE_PROMPT_IDS = {0}  # prompt_id values that process_docs keeps
+
+
+def _process_doc(doc):  # maps one source row to instruction / inputs / outputs / meta
+    return {
+        "instruction": doc["instruction"],  # required key; a missing key raises KeyError
+        "inputs": "",  # always empty for this task
+        "outputs": doc.get("reference_answer", "") or "",  # a missing or falsy reference becomes ""
+        "meta": {
+            "id": int(doc["prompt_id"]),  # required key, coerced to int
+            "reference_answer": doc.get("reference_answer", "") or "",  # same value as "outputs"
+            "source_dataset": "ai-forever/POLLUX-instructions",
+            "source_prompt_id": int(doc["prompt_id"]),  # same value as "id"
+            "task_type": doc.get("task_type", ""),
+            "task_subtype": doc.get("task_subtype", ""),
+            "task_subsubtype": doc.get("task_subsubtype", ""),
+            "difficulty": doc.get("difficulty", ""),
+            "domain": doc.get("domain", ""),
+        },
+    }
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:  # filter to the example prompts, then reshape each row
+    filtered = dataset.filter(lambda doc: doc["prompt_id"] in EXAMPLE_PROMPT_IDS)  # keep only rows whose prompt_id is listed
+    return filtered.map(_process_doc)  # apply _process_doc to every remaining row
+
+
+process_results = build_process_results(TASK_DIR)  # built at import time from TASK_DIR; referenced by the task YAML as utils.process_results
+
diff --git a/benchmark_tasks/rucodeeval/utils.py b/benchmark_tasks/rucodeeval/utils.py
@@ -14,6 +14,7 @@
 
 from lm_eval.api.filter import Filter
 from lm_eval.api.registry import register_filter
+from lm_eval.api.registry import FILTER_REGISTRY
 
 
 def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
@@ -40,27 +41,28 @@ def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
     }  # if no label provided (test answers are secret)
 
 
-@register_filter("ruhumanevalscoring")
-class ruHumanEvalScoring(Filter):
-    def __init__(self) -> None:
-        """
-        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
-        """
-
-    def apply(self, resps, docs):
-        """
-        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
-        """
-        # resps: List[List[str]] - list of lists of generations
-        code_results = []
-        for idx, sample in enumerate(resps):
-            sample_metrics = []
-            for completion in sample:
-                processed_completion = preprocess_generation(completion)
-                result = execute_function(processed_completion, docs[idx])  # List
-                sample_metrics.extend([result])
-            code_results.extend([sample_metrics])
-        return code_results
+if not FILTER_REGISTRY.get("ruhumanevalscoring", None):  # define and register only when no truthy entry with this name exists
+    @register_filter("ruhumanevalscoring")
+    class ruHumanEvalScoring(Filter):
+        def __init__(self) -> None:
+            """
+            Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+            """
+
+        def apply(self, resps, docs):
+            """
+            Each entry of `resps` is a list of model responses; every response is preprocessed and executed against the doc at the same index, and all results are kept.
+            """
+            # resps: List[List[str]] - list of lists of generations
+            code_results = []  # one list of per-completion results per entry of resps
+            for idx, sample in enumerate(resps):
+                sample_metrics = []
+                for completion in sample:
+                    processed_completion = preprocess_generation(completion)
+                    result = execute_function(processed_completion, docs[idx])  # List
+                    sample_metrics.extend([result])  # appends result as a single element
+                code_results.extend([sample_metrics])  # appends the per-sample list as a single element
+            return code_results
 
 
 def preprocess_generation(generation):

diff --git a/benchmark_tasks/ruhumaneval/utils.py b/benchmark_tasks/ruhumaneval/utils.py
@@ -14,6 +14,7 @@
 
 from lm_eval.api.filter import Filter
 from lm_eval.api.registry import register_filter
+from lm_eval.api.registry import FILTER_REGISTRY
 
 
 def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
@@ -40,27 +41,28 @@ def process_results(doc: Dict, results: List[str]) -> Dict[str, float]:
     }  # if no label provided (test answers are secret)
 
 
-@register_filter("ruhumanevalscoring")
-class ruHumanEvalScoring(Filter):
-    def __init__(self) -> None:
-        """
-        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
-        """
-
-    def apply(self, resps, docs):
-        """
-        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
-        """
-        # resps: List[List[str]] - list of lists of generations
-        code_results = []
-        for idx, sample in enumerate(resps):
-            sample_metrics = []
-            for completion in sample:
-                processed_completion = preprocess_generation(completion)
-                result = execute_function(processed_completion, docs[idx])  # List
-                sample_metrics.extend([result])
-            code_results.extend([sample_metrics])
-        return code_results
+if not FILTER_REGISTRY.get("ruhumanevalscoring", None):  # define and register only when no truthy entry with this name exists
+    @register_filter("ruhumanevalscoring")
+    class ruHumanEvalScoring(Filter):
+        def __init__(self) -> None:
+            """
+            Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+            """
+
+        def apply(self, resps, docs):
+            """
+            Each entry of `resps` is a list of model responses; every response is preprocessed and executed against the doc at the same index, and all results are kept.
+            """
+            # resps: List[List[str]] - list of lists of generations
+            code_results = []  # one list of per-completion results per entry of resps
+            for idx, sample in enumerate(resps):
+                sample_metrics = []
+                for completion in sample:
+                    processed_completion = preprocess_generation(completion)
+                    result = execute_function(processed_completion, docs[idx])  # List
+                    sample_metrics.extend([result])  # appends result as a single element
+                code_results.extend([sample_metrics])  # appends the per-sample list as a single element
+            return code_results
 
 
 def preprocess_generation(generation):

diff --git a/benchmark_tasks/rutie/utils.py b/benchmark_tasks/rutie/utils.py
@@ -44,10 +44,10 @@ def _update_request(storage, request):
 
     # when string passed (everywhere except for API calls)
     if isinstance(request.arguments[0], str):
-        new_req = replace_targets(request.arguments[0], max_num, storage)
+        new_req = replace_targets(request.arguments[0], max_num, storage).replace("{context}", "")
         request.arguments = (new_req, request.arguments[1])
     else:
-        new_req = replace_targets(request.arguments[0].prompt, max_num, storage)
+        new_req = replace_targets(request.arguments[0].prompt, max_num, storage).replace("{context}", "")
         new_req = JsonChatStr(new_req)
         request.arguments = (new_req, request.arguments[1])