2 changes: 1 addition & 1 deletion src/muse/__init__.py
@@ -3,4 +3,4 @@

__version__ = "0.1.dev1"

__all__ = ["__version__", "evaluation", "parallel_corpus", "translation"]
__all__ = ["__version__", "annotation", "evaluation", "parallel_corpus", "translation"]
137 changes: 137 additions & 0 deletions src/muse/annotation/annotation_recipes.py
@@ -0,0 +1,137 @@
"""
This module provides custom recipes for Prodigy annotation.

Recipes:
* ``concept-eval``: Notion concept evaluation recipe.

Example Usage:

prodigy concept-eval muse_concepts notion-concept-tasks.jsonl -F annotation_recipes.py
"""

import spacy
from prodigy import log, set_hashes
from prodigy.components.preprocess import tokenize_example
from prodigy.components.stream import get_stream
from prodigy.core import Arg, recipe
from prodigy.types import RecipeSettingsType, StreamType


def add_tokens(stream: StreamType) -> StreamType:
    """
    Add token data to each example using a blank spaCy pipeline for the
    example's target language; pipelines are cached per language.
    """
tokenizers = {}

for ex in stream:
lang = ex["tr_lang"]
if lang not in tokenizers:
tokenizers[lang] = spacy.blank(lang)
nlp = tokenizers[lang]
yield tokenize_example(ex, nlp(ex["text"]))


def add_questions(questions: list[str], stream: StreamType) -> StreamType:
for ex in stream:
yield ex | {"questions": questions}


@recipe(
"concept-eval",
dataset=Arg(help="Dataset to save answers to"),
source=Arg(help="The source data as a JSONL file"),
)
def concept_eval_recipe(
dataset: str,
source: str,
) -> RecipeSettingsType:
# TODO: Consider adding an instruction page. See https://prodi.gy/docs/api-web-app#instructions
log("RECIPE: Starting recipe concept-eval", locals())

def validate_answer(eg) -> None:
q1_spans = eg.get("spans", [])
q2_selected = eg.get("accept", [])

# Validate Q1 answer
if len(q1_spans) == 0 and "missing" not in q2_selected:
raise ValueError(
"Must select the translation of the concept if it wasn't omitted entirely"
)
# Validate Q2 answer
if len(q2_selected) == 0:
raise ValueError("Missing answer for Q2")
elif "missing" in q2_selected and len(q1_spans) > 0:
raise ValueError(
"If the concept was omitted in the translation, no selections should be made for Q1"
)

# Question prompts for task
questions = [
"Q1. For the folloing translation, highlight the translation of the concept",
"Q2. Evaluate the machine translation of the concept",
"Q3. Notes / observations",
]

    # HTML template for the task header: concept, source text, and reference translation
init_html_tmpl = "\n".join(
[
"<h2>Concept: {{term}}</h2>",
"<p><b>Source Text</b>",
"{{src_text}}",
"<details>",
"\t<summary><b>Professional English Translation</b></summary>{{ref_text}}",
"</details>",
f"<hr><b>{questions[0]}</b>",
]
)
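    # Prodigy fills the {{...}} placeholders above from the fields of each
    # incoming task record, so every example must provide "term", "src_text",
    # and "ref_text" (see the record sketch at the top of this module).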

options = [
{"id": "correct", "text": "Correct"},
{"id": "partial", "text": "Partially correct"},
{"id": "wrong", "text": "Incorrect"},
{"id": "verbatim", "text": "Copied verbatim"},
{"id": "missing", "text": "Missing / Omitted"},
]

blocks = [
{"view_id": "html", "html_template": init_html_tmpl},
{"view_id": "ner_manual", "labels": ["CONCEPT"]},
{"view_id": "html", "html": f"<hr><b>{questions[1]}</b>"},
{"view_id": "choice", "text": None, "options": options},
{"view_id": "html", "html": f"<hr><b>{questions[2]}</b>"},
{"view_id": "text_input", "field_rows": 3},
]

# Setup config
config = {
"buttons": ["accept", "undo"], # remove reject and ignore buttons
"show_flag": True, # show flag button to mark weird machine translations
"honor_token_whitespace": True, # reflect whitespace accurately (e.g. in case of leading/trailing spaces)
"blocks": blocks,
"ner_manual_highlight_chars": True,
}

    # Create the stream and add token data and question prompts to each example
    stream = get_stream(source)
    stream.apply(add_tokens, stream)
    stream.apply(add_questions, questions, stream)

    # Set input/task hashes so Prodigy can identify duplicate examples
    def set_stream_hashes(stream: StreamType) -> StreamType:
        for ex in stream:
            yield set_hashes(
                ex, input_keys=("tr_id",), task_keys=("questions", "spans", "options")
            )

stream.apply(set_stream_hashes, stream)

components = {
"dataset": dataset,
"stream": stream,
"view_id": "blocks",
"config": config,
"validate_answer": validate_answer,
}

return components
83 changes: 83 additions & 0 deletions src/muse/annotation/build_notion_concept_tasks.py
@@ -0,0 +1,83 @@
"""
This script prepares the input for the Notion concept annotation task in
Prodigy. The task corpus is built by joining the Notion parallel sentence
corpus with one or more Notion sentence translation corpora.

Example Usage:

build_notion_concept_tasks.py out.jsonl notion-parallel-sents.jsonl --mt-corpus mt_corpus.jsonl
build_notion_concept_tasks.py out.jsonl notion-parallel-sents.jsonl --mt-corpus mt1.jsonl mt2.jsonl
"""

import argparse
import pathlib
import sys

import polars as pl


def build_tasks(
parallel_corpus: pathlib.Path, mt_corpora: list[pathlib.Path], output: pathlib.Path
) -> None:
# Load parallel sentences
terms_df = (
pl.read_ndjson(parallel_corpus)
# Select terms of interest, namely the record id and term
.select(["id", "term"])
# Rename id to pair_id for join
.rename({"id": "pair_id"})
)
# Load machine translations
mt_df = (
pl.concat([pl.read_ndjson(corpus) for corpus in mt_corpora])
# Ignore back translations
.filter(pl.col("src_lang") != "en")
        # Rename the translation text column to "text", as expected for span annotation in Prodigy
.rename({"tr_text": "text"})
)

# Join dataframes on pair_id
result_df = mt_df.join(terms_df, "pair_id")
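    # A joined record might look like this (hypothetical values; columns other
    # than "pair_id", "text", and "term" depend on the MT corpus schema):
    #   {"pair_id": "p1", "src_lang": "de", "text": "...", "term": "page", ...}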

# Write output
result_df.write_ndjson(output)


def main():
parser = argparse.ArgumentParser(
description="Builds prodigy annotation tasks from Notion sentence translations"
)
parser.add_argument("output", type=pathlib.Path, help="Output prodigy task JSONL")
parser.add_argument(
"parallel_corpus", type=pathlib.Path, help="Parallel notion sentence corpus"
)
parser.add_argument(
"--mt-corpus",
nargs="+",
type=pathlib.Path,
required=True,
help="One or more machine translation corpora",
)

args = parser.parse_args()

    if not args.parallel_corpus.is_file():
        print(f"Error: {args.parallel_corpus} does not exist", file=sys.stderr)
        sys.exit(1)
    for f in args.mt_corpus:
        if not f.is_file():
            print(f"Error: {f} does not exist", file=sys.stderr)
            sys.exit(1)
    if args.output.is_file():
        print(f"Error: {args.output} exists. Not overwriting.", file=sys.stderr)
        sys.exit(1)

build_tasks(
args.parallel_corpus,
args.mt_corpus,
args.output,
)


if __name__ == "__main__":
main()