217 changes: 35 additions & 182 deletions src/corppa/poetry_detection/refmatcha.py
@@ -54,6 +54,7 @@
LABELED_EXCERPT_FIELDS,
fix_data_types,
)
from corppa.poetry_detection.ref_corpora import compile_metadata_df, fulltext_corpora

logger = logging.getLogger(__name__)

@@ -65,74 +66,31 @@
REF_DATA_DIR = pathlib.Path("poetry-reference-data")
TEXT_PARQUET_FILE = REF_DATA_DIR / "poems.parquet"
META_PARQUET_FILE = REF_DATA_DIR / "poem_metadata.parquet"
# csv files to supplement .txt files
POETRY_FOUNDATION_CSV = REF_DATA_DIR / "poetryfoundationdataset.csv"
CHADWYCK_HEALEY_CSV = REF_DATA_DIR / "chadwyck_healey_metadata.csv"
# define source ids to ensure we are consistent
SOURCE_ID = {
"Poetry Foundation": "poetry-foundation",
"Chadwyck-Healey": "chadwyck-healey",
"internet-poems": "internet_poems",
}


def compile_text(data_dir, output_file):
"""Compile reference poems into a parquet file for quick identification
of poetry excerpts based on matching text. Looks for text files in
directories under `data_dir`; uses the filename stem as poem identifier
and the containing directory name as the id for the source reference corpus.
Also looks for and includes content from `poetryfoundationdataset.csv`
contained in the data directory.
of poetry excerpts based on matching text.
"""

# parquet file schema:
# - poem id
# - text of the poem
# - source (identifier for the reference corpus)
schema = pa.schema(
[("id", pa.string()), ("text", pa.string()), ("source", pa.string())]
poem_text = pl.DataFrame(
[],
schema={
"poem_id": pl.String,
"text": pl.String,
"ref_corpus": pl.String,
},
)
# open a parquet writer so we can add records in chunks
pqwriter = pq.ParquetWriter(output_file, schema)

# handle files in batches
# look for .txt files in nested directories; use parent directory name as
# the reference corpus source name/id
for chunk in batched(iglob(f"{data_dir}/**/*.txt"), 1000):
chunk_files = [pathlib.Path(f) for f in chunk]
ids = [f.stem for f in chunk_files]
sources = [SOURCE_ID.get(f.parent.name, f.parent.name) for f in chunk_files]
texts = [f.open().read() for f in chunk_files]
# create and write a record batch
record_batch = pa.RecordBatch.from_arrays(
[ids, texts, sources], names=["id", "text", "source"]
)
pqwriter.write_batch(record_batch)

# poetry foundation text content is included in the csv file
if POETRY_FOUNDATION_CSV.exists():
# load poetry foundation csv into a polars dataframe
# - rename columns for our use
# - add source column
# - select only the columns we want to include
pf_df = (
pl.read_csv(POETRY_FOUNDATION_CSV)
.rename({"Poetry Foundation ID": "id", "Content": "text"})
.with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
.select(["id", "text", "source"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in pf_df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Poetry Foundation csv file not found for text compilation (expected at {POETRY_FOUNDATION_CSV})",
file=sys.stderr,
)

# close the parquet file
pqwriter.close()
# for each full-text reference corpus, load poem text into a polars
# dataframe and add a column identifying the corpus
for ref_corpus in fulltext_corpora():
# NOTE: progress bar is enabled here, but ideally needs a
# prefix/context label per corpus
corpus_text = pl.from_dicts(
ref_corpus.get_text(disable_progress=False)
).with_columns(ref_corpus=pl.lit(ref_corpus.corpus_id))
poem_text.extend(corpus_text)
poem_text.write_parquet(output_file)
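# Illustrative sketch (not part of this change): reading back the compiled
# parquet file, assuming compile_text() has already been run with the default
# TEXT_PARQUET_FILE path; the path literal below is that default.
import polars as pl

poems = pl.read_parquet("poetry-reference-data/poems.parquet")
# expected schema: poem_id (String), text (String), ref_corpus (String)
print(poems["ref_corpus"].value_counts())  # poem counts per reference corpus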


def compile_metadata(data_dir, output_file):
@@ -146,97 +104,9 @@ def compile_metadata(data_dir, output_file):
# for poem dataset output, we need poem id, author, and title
# to match text results, we need poem id and source id

schema = pa.schema(
[
("id", pa.string()),
("source", pa.string()),
("author", pa.string()),
("title", pa.string()),
]
)
# open a parquet writer for outputting content in batches
pqwriter = pq.ParquetWriter(output_file, schema)

# TODO: prioritize internet-poems matches over CH

# load chadwyck healey metadata
if CHADWYCK_HEALEY_CSV.exists():
# use polars to read in the csv and convert to the format we want
# - rename main title to title
# - add source id for all rows
# - combine author first and last name
# - reorder and limit columns to match parquet schema
df = (
# ignore parse errors in fields we don't care about (author_dob)
pl.read_csv(CHADWYCK_HEALEY_CSV, ignore_errors=True)
.rename({"title_main": "title"})
.with_columns(source=pl.lit(SOURCE_ID["Chadwyck-Healey"]))
.with_columns(
pl.concat_str(
[pl.col("author_firstname"), pl.col("author_lastname")],
separator=" ",
).alias("author")
)
.select(["id", "source", "author", "title"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Chadwyck-Healey csv file not found for metadata compilation (expected at {CHADWYCK_HEALEY_CSV})",
file=sys.stderr,
)

# for the directory of internet poems, metadata is embedded in file name
internet_poems_dir = data_dir / "internet-poems"
# this directory is a set of manually curated texts;
# currently only 112 files, so don't worry about chunking until needed
poem_files = list(internet_poems_dir.glob("*.txt"))
# use filename without .txt as poem identifier
ids = [p.stem for p in poem_files]
# filename is : Firstname-Lastname_Poem-Title.txt
# author name: filename before the _ with dashes replaced with spaces
authors = [p.stem.split("_", 1)[0].replace("-", " ") for p in poem_files]
# title: same approach as author, using the text after the _
titles = [p.stem.split("_", 1)[1].replace("-", " ") for p in poem_files]
source = [SOURCE_ID["internet-poems"]] * len(ids)

# create a record batch to write out
record_batch = pa.RecordBatch.from_arrays(
[ids, source, authors, titles], names=["id", "source", "author", "title"]
)
pqwriter.write_batch(record_batch)

# load poetry foundation data from csv file
# do this one last since it is least preferred of our sources
if POETRY_FOUNDATION_CSV.exists():
# use polars to read in the csv and convert to the format we want
# - rename columns to match desired output
# - add source id
# - reorder and limit columns to match parquet schema
df = (
pl.read_csv(POETRY_FOUNDATION_CSV)
# .drop("Content", "")
.rename(
{"Author": "author", "Title": "title", "Poetry Foundation ID": "id"}
)
.with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
.select(["id", "source", "author", "title"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Poetry Foundation csv file not found for metadata compilation (expected at {POETRY_FOUNDATION_CSV})",
file=sys.stderr,
)

# close the parquet file
pqwriter.close()
meta_df = compile_metadata_df()
# TODO: add an option to request full-text corpora only?
meta_df.write_parquet(output_file)


# unicode line separator; used in some internet poems text files
Expand All @@ -253,6 +123,7 @@ def _text_for_search(expr):
.str.replace_all(r"(\w) \| -(\w)", "$1$2")
# replace other punctuation with spaces
.str.replace_all("[[:punct:]]", " ")
.str.replace_all("\\*", "") # remove asterisk
.str.replace_all(
LINE_SEPARATOR, "\n"
) # replace unicode line separator with newline
@@ -338,20 +209,6 @@ def multiple_matches(filtered_ref_df):
if match_df is not None:
return match_df, reason

# if author/title duplication check failed, check for author matches
# poetry foundation includes Shakespeare drama excerpts with alternate names
authordupe_df = df.filter(df.select(["_author"]).is_duplicated())
if not authordupe_df.is_empty():
# Shakespeare shows up oddly in poetry foundation;
# if author matches, assume the other source has the correct title
non_poetryfoundtn = authordupe_df.filter(
pl.col("source") != SOURCE_ID["Poetry Foundation"]
)
if non_poetryfoundtn.height == 1:
match_df = non_poetryfoundtn.limit(1)
reason = "duplicate author but not title; excluding Poetry Foundation"
return match_df, reason

return None, None


@@ -385,9 +242,9 @@ def identify_excerpt(
)
match_info = None
result = None
# preserve any notes on the incoming excerpt
# (is this what we want? notes might get duplicated if/when we merge...)
note_lines = [excerpt_row["notes"]] if excerpt.notes is not None else []
# notes on the incoming excerpt are ignored, since we assume
# the output will be merged.
# If we preserve existing notes here they will be duplicated on merge.

search_field = f"search_{search_text}"
search_field_label = search_text.replace("_", " ")
@@ -450,8 +307,6 @@ def identify_excerpt(
# but we only return labeled excerpts, so out of scope for now

if match_df is not None:
# rename columns for export
match_df = match_df.rename({"id": "poem_id", "source": "ref_corpus"})
# get the first row as a dictionary
match_info = match_df.row(0, named=True)

@@ -489,9 +344,7 @@ def identify_excerpt(
]

# add note about how the match was determined
# return as new field; must be merged with notes in calling code
note_lines.append(f"{SCRIPT_ID}: {id_note}")
match_info["notes"] = "\n".join(note_lines).strip()
match_info["notes"] = f"{SCRIPT_ID}: {id_note}"
# set id method
match_info["identification_methods"] = {SCRIPT_ID}

@@ -596,13 +449,13 @@ def process(input_file, output_file, recompile=False):
reference_df = pl.read_parquet(TEXT_PARQUET_FILE)
meta_df = pl.read_parquet(META_PARQUET_FILE)
print(f"Poetry reference text data: {reference_df.height:,} entries")
print("total by source")
source_counts = reference_df["source"].value_counts()
print("total by reference corpus")
source_counts = reference_df["ref_corpus"].value_counts()
for value, count in source_counts.iter_rows():
# row is a tuple of value, count
print(f"\t{value}: {count:,}")

# some texts from poetry foundation and maybe Chadwyck-Healey are truncated
# some texts from Chadwyck-Healey are truncated
# discard them to avoid bad partial/fuzzy matches
reference_df = reference_df.with_columns(text_length=pl.col("text").str.len_chars())
min_length = 15
@@ -611,8 +464,8 @@
print(f" Omitting {short_texts.height} poems with text length < {min_length}")

print(f"Poetry reference metadata: {meta_df.height:,} entries")
print("total by source")
source_counts = meta_df["source"].value_counts()
print("total by reference corpus")
source_counts = meta_df["ref_corpus"].value_counts()
for value, count in source_counts.iter_rows():
# row is a tuple of value, count
print(f"\t{value}: {count:,}")
@@ -621,16 +474,16 @@
reference_df = reference_df.join(
meta_df,
# join on the combination of poem id and source id
on=pl.concat_str([pl.col("id"), pl.col("source")], separator="|"),
on=pl.concat_str([pl.col("poem_id"), pl.col("ref_corpus")], separator="|"),
how="left", # occasionally ids do not match,
# e.g. Chadwyck Healey poem id we have text for but not in metadata
).drop("id_right", "source_right")
).drop("poem_id_right", "ref_corpus_right")

# generate a simplified text field for searching
# NOTE: this part is a bit slow
reference_df = generate_search_text(reference_df)

# load csv with excerpt fieldnames
# load csv with excerpt field names
try:
input_df = fix_data_types(pl.read_csv(input_file, columns=EXCERPT_FIELDS))
except pl.exceptions.NoDataError as err: