2 changes: 1 addition & 1 deletion src/muse/__init__.py
@@ -3,4 +3,4 @@

__version__ = "0.1.dev1"

__all__ = ["__version__", "evaluation", "parallel_corpus", "translation"]
__all__ = ["__version__", "annotation", "evaluation", "parallel_corpus", "translation"]
137 changes: 137 additions & 0 deletions src/muse/annotation/annotation_recipes.py
@@ -0,0 +1,137 @@
"""
This module provides custom recipes for Prodigy annotation.

Recipes:
* ``concept-eval``: Notion concept evaluation recipe.

Example Usage:

prodigy concept-eval muse_concepts notion-concept-tasks.jsonl -F annotation_recipes.py
"""

import spacy
from prodigy import log, set_hashes
from prodigy.components.preprocess import tokenize_example
from prodigy.components.stream import get_stream
from prodigy.core import Arg, recipe
from prodigy.types import RecipeSettingsType, StreamType


def add_tokens(stream: StreamType) -> StreamType:
    """
    Add token data to each example using a blank spaCy pipeline for the
    example's target language; pipelines are cached per language.
    """
tokenizers = {}

for ex in stream:
lang = ex["tr_lang"]
if lang not in tokenizers:
tokenizers[lang] = spacy.blank(lang)
nlp = tokenizers[lang]
yield tokenize_example(ex, nlp(ex["text"]))


def add_questions(questions: list[str], stream: StreamType) -> StreamType:
for ex in stream:
yield ex | {"questions": questions}


@recipe(
"concept-eval",
dataset=Arg(help="Dataset to save answers to"),
source=Arg(help="The source data as a JSONL file"),
)
def concept_eval_recipe(
dataset: str,
source: str,
) -> RecipeSettingsType:
# TODO: Consider adding an instruction page. See https://prodi.gy/docs/api-web-app#instructions
log("RECIPE: Starting recipe concept-eval", locals())

def validate_answer(eg) -> None:
q1_spans = eg.get("spans", [])
q2_selected = eg.get("accept", [])

# Validate Q1 answer
if len(q1_spans) == 0 and "missing" not in q2_selected:
raise ValueError(
"Must select the translation of the concept if it wasn't omitted entirely"
)
# Validate Q2 answer
if len(q2_selected) == 0:
raise ValueError("Missing answer for Q2")
elif "missing" in q2_selected and len(q1_spans) > 0:
raise ValueError(
"If the concept was omitted in the translation, no selections should be made for Q1"
)

# Question prompts for task
questions = [
"Q1. For the folloing translation, highlight the translation of the concept",
"Q2. Evaluate the machine translation of the concept",
"Q3. Notes / observations",
]

    # HTML template for the task header: concept, source text, and reference translation
init_html_tmpl = "\n".join(
[
"<h2>Concept: {{term}}</h2>",
"<p><b>Source Text</b>",
"{{src_text}}",
"<details>",
"\t<summary><b>Professional English Translation</b></summary>{{ref_text}}",
"</details>",
f"<hr><b>{questions[0]}</b>",
]
)
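    # Prodigy fills the {{...}} placeholders above from the fields of each
    # incoming task record, so every example must provide "term", "src_text",
    # and "ref_text" (see the record sketch at the top of this module).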

options = [
{"id": "correct", "text": "Correct"},
{"id": "partial", "text": "Partially correct"},
{"id": "wrong", "text": "Incorrect"},
{"id": "verbatim", "text": "Copied verbatim"},
{"id": "missing", "text": "Missing / Omitted"},
]

blocks = [
{"view_id": "html", "html_template": init_html_tmpl},
{"view_id": "ner_manual", "labels": ["CONCEPT"]},
{"view_id": "html", "html": f"<hr><b>{questions[1]}</b>"},
{"view_id": "choice", "text": None, "options": options},
{"view_id": "html", "html": f"<hr><b>{questions[2]}</b>"},
{"view_id": "text_input", "field_rows": 3},
]

# Setup config
config = {
"buttons": ["accept", "undo"], # remove reject and ignore buttons
"show_flag": True, # show flag button to mark weird machine translations
"honor_token_whitespace": True, # reflect whitespace accurately (e.g. in case of leading/trailing spaces)
"blocks": blocks,
"ner_manual_highlight_chars": True,
}

    # Create the stream and add token data and question prompts to each example
    stream = get_stream(source)
    stream.apply(add_tokens, stream)
    stream.apply(add_questions, questions, stream)

    # Set input/task hashes so Prodigy can identify duplicate examples
    def set_stream_hashes(stream: StreamType) -> StreamType:
        for ex in stream:
            yield set_hashes(
                ex, input_keys=("tr_id",), task_keys=("questions", "spans", "options")
            )

stream.apply(set_stream_hashes, stream)

components = {
"dataset": dataset,
"stream": stream,
"view_id": "blocks",
"config": config,
"validate_answer": validate_answer,
}

return components
83 changes: 83 additions & 0 deletions src/muse/annotation/build_notion_concept_tasks.py
@@ -0,0 +1,83 @@
"""
This script prepares the input for the Notion concept annotation task in
Prodigy. The task corpus is built by joining the Notion parallel sentence
corpus with one or more Notion sentence translation corpora.

Example Usage:

build_notion_concept_tasks.py out.jsonl notion-parallel-sents.jsonl --mt-corpus mt_corpus.jsonl
build_notion_concept_tasks.py out.jsonl notion-parallel-sents.jsonl --mt-corpus mt1.jsonl mt2.jsonl
"""

import argparse
import pathlib
import sys

import polars as pl


def build_tasks(
parallel_corpus: pathlib.Path, mt_corpora: list[pathlib.Path], output: pathlib.Path
) -> None:
# Load parallel sentences
terms_df = (
pl.read_ndjson(parallel_corpus)
# Select terms of interest, namely the record id and term
.select(["id", "term"])
# Rename id to pair_id for join
.rename({"id": "pair_id"})
)
# Load machine translations
mt_df = (
pl.concat([pl.read_ndjson(corpus) for corpus in mt_corpora])
# Ignore back translations
.filter(pl.col("src_lang") != "en")
        # Rename the translation text column to "text", as expected for span annotation in Prodigy
.rename({"tr_text": "text"})
)

# Join dataframes on pair_id
result_df = mt_df.join(terms_df, "pair_id")
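    # A joined record might look like this (hypothetical values; columns other
    # than "pair_id", "text", and "term" depend on the MT corpus schema):
    #   {"pair_id": "p1", "src_lang": "de", "text": "...", "term": "page", ...}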

# Write output
result_df.write_ndjson(output)


def main():
parser = argparse.ArgumentParser(
description="Builds prodigy annotation tasks from Notion sentence translations"
)
parser.add_argument("output", type=pathlib.Path, help="Output prodigy task JSONL")
parser.add_argument(
"parallel_corpus", type=pathlib.Path, help="Parallel notion sentence corpus"
)
parser.add_argument(
"--mt-corpus",
nargs="+",
type=pathlib.Path,
required=True,
help="One or more machine translation corpora",
)

args = parser.parse_args()

    if not args.parallel_corpus.is_file():
        print(f"Error: {args.parallel_corpus} does not exist", file=sys.stderr)
        sys.exit(1)
    for f in args.mt_corpus:
        if not f.is_file():
            print(f"Error: {f} does not exist", file=sys.stderr)
            sys.exit(1)
    if args.output.is_file():
        print(f"Error: {args.output} exists. Not overwriting.", file=sys.stderr)
        sys.exit(1)

build_tasks(
args.parallel_corpus,
args.mt_corpus,
args.output,
)


if __name__ == "__main__":
main()