217 changes: 35 additions & 182 deletions src/corppa/poetry_detection/refmatcha.py
@@ -54,6 +54,7 @@
LABELED_EXCERPT_FIELDS,
fix_data_types,
)
from corppa.poetry_detection.ref_corpora import compile_metadata_df, fulltext_corpora

logger = logging.getLogger(__name__)

@@ -65,74 +66,31 @@
REF_DATA_DIR = pathlib.Path("poetry-reference-data")
TEXT_PARQUET_FILE = REF_DATA_DIR / "poems.parquet"
META_PARQUET_FILE = REF_DATA_DIR / "poem_metadata.parquet"
# csv files to supplement .txt files
POETRY_FOUNDATION_CSV = REF_DATA_DIR / "poetryfoundationdataset.csv"
CHADWYCK_HEALEY_CSV = REF_DATA_DIR / "chadwyck_healey_metadata.csv"
# define source ids to ensure we are consistent
SOURCE_ID = {
"Poetry Foundation": "poetry-foundation",
"Chadwyck-Healey": "chadwyck-healey",
"internet-poems": "internet_poems",
}


def compile_text(data_dir, output_file):
"""Compile reference poems into a parquet file for quick identification
of poetry excerpts based on matching text. Looks for text files in
directories under `data_dir`; uses the filename stem as poem identifier
and the containing directory name as the id for the source reference corpus.
Also looks for and includes content from `poetryfoundationdataset.csv`
contained in the data directory.
of poetry excerpts based on matching text.
"""

# parquet file schema:
# - poem id
# - text of the poem
# - source (identifier for the reference corpus)
schema = pa.schema(
[("id", pa.string()), ("text", pa.string()), ("source", pa.string())]
poem_text = pl.DataFrame(
[],
schema={
"poem_id": pl.String,
"text": pl.String,
"ref_corpus": pl.String,
},
)
# open a parquet writer so we can add records in chunks
pqwriter = pq.ParquetWriter(output_file, schema)

# handle files in batches
# look for .txt files in nested directories; use parent directory name as
# the reference corpus source name/id
for chunk in batched(iglob(f"{data_dir}/**/*.txt"), 1000):
chunk_files = [pathlib.Path(f) for f in chunk]
ids = [f.stem for f in chunk_files]
sources = [SOURCE_ID.get(f.parent.name, f.parent.name) for f in chunk_files]
texts = [f.open().read() for f in chunk_files]
# create and write a record batch
record_batch = pa.RecordBatch.from_arrays(
[ids, texts, sources], names=["id", "text", "source"]
)
pqwriter.write_batch(record_batch)

# poetry foundation text content is included in the csv file
if POETRY_FOUNDATION_CSV.exists():
# load poetry foundation csv into a polars dataframe
# - rename columns for our use
# - add source column
# - select only the columns we want to include
pf_df = (
pl.read_csv(POETRY_FOUNDATION_CSV)
.rename({"Poetry Foundation ID": "id", "Content": "text"})
.with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
.select(["id", "text", "source"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in pf_df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Poetry Foundation csv file not found for text compilation (expected at {POETRY_FOUNDATION_CSV})",
file=sys.stderr,
)

# close the parquet file
pqwriter.close()
# for each full-text reference corpus, load poem text into a polars
# dataframe and add a column identifying the corpus
for ref_corpus in fulltext_corpora():
# NOTE: progress bar is enabled here, but ideally needs a
# prefix/context label per corpus
corpus_text = pl.from_dicts(
ref_corpus.get_text(disable_progress=False)
).with_columns(ref_corpus=pl.lit(ref_corpus.corpus_id))
poem_text.extend(corpus_text)
poem_text.write_parquet(output_file)
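# Illustrative sketch (not part of this change): reading back the compiled
# parquet file, assuming compile_text() has already been run with the default
# TEXT_PARQUET_FILE path; the path literal below is that default.
import polars as pl

poems = pl.read_parquet("poetry-reference-data/poems.parquet")
# expected schema: poem_id (String), text (String), ref_corpus (String)
print(poems["ref_corpus"].value_counts())  # poem counts per reference corpus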


def compile_metadata(data_dir, output_file):
@@ -146,97 +104,9 @@ def compile_metadata(data_dir, output_file):
# for poem dataset output, we need poem id, author, and title
# to match text results, we need poem id and source id

schema = pa.schema(
[
("id", pa.string()),
("source", pa.string()),
("author", pa.string()),
("title", pa.string()),
]
)
# open a parquet writer for outputting content in batches
pqwriter = pq.ParquetWriter(output_file, schema)

# TODO: prioritize internet-poems matches over CH

# load chadwyck healey metadata
if CHADWYCK_HEALEY_CSV.exists():
# use polars to read in the csv and convert to the format we want
# - rename main title to title
# - add source id for all rows
# - combine author first and last name
# - reorder and limit columns to match parquet schema
df = (
# ignore parse errors in fields we don't care about (author_dob)
pl.read_csv(CHADWYCK_HEALEY_CSV, ignore_errors=True)
.rename({"title_main": "title"})
.with_columns(source=pl.lit(SOURCE_ID["Chadwyck-Healey"]))
.with_columns(
pl.concat_str(
[pl.col("author_firstname"), pl.col("author_lastname")],
separator=" ",
).alias("author")
)
.select(["id", "source", "author", "title"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Chadwyck-Healey csv file not found for metadata compilation (expected at {CHADWYCK_HEALEY_CSV})",
file=sys.stderr,
)

# for the directory of internet poems, metadata is embedded in file name
internet_poems_dir = data_dir / "internet-poems"
# this directory is a set of manually curated texts;
# currently only 112 files, so don't worry about chunking until needed
poem_files = list(internet_poems_dir.glob("*.txt"))
# use filename without .txt as poem identifier
ids = [p.stem for p in poem_files]
# filename is : Firstname-Lastname_Poem-Title.txt
# author name: filename before the _ with dashes replaced with spaces
authors = [p.stem.split("_", 1)[0].replace("-", " ") for p in poem_files]
# title: same approach as author, using the text after the _
titles = [p.stem.split("_", 1)[1].replace("-", " ") for p in poem_files]
source = [SOURCE_ID["internet-poems"]] * len(ids)

# create a record batch to write out
record_batch = pa.RecordBatch.from_arrays(
[ids, source, authors, titles], names=["id", "source", "author", "title"]
)
pqwriter.write_batch(record_batch)

# load poetry foundation data from csv file
# do this one last since it is least preferred of our sources
if POETRY_FOUNDATION_CSV.exists():
# use polars to read in the csv and convert to the format we want
# - rename columns to match desired output
# - add source id
# - reorder and limit columns to match parquet schema
df = (
pl.read_csv(POETRY_FOUNDATION_CSV)
# .drop("Content", "")
.rename(
{"Author": "author", "Title": "title", "Poetry Foundation ID": "id"}
)
.with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
.select(["id", "source", "author", "title"])
)
# convert polars dataframe to arrow table, cast to our schema to
# align types (large string vs string), then write out in batches
for batch in df.to_arrow().cast(target_schema=schema).to_batches():
pqwriter.write_batch(batch)
else:
print(
f"Poetry Foundation csv file not found for metadata compilation (expected at {POETRY_FOUNDATION_CSV})",
file=sys.stderr,
)

# close the parquet file
pqwriter.close()
meta_df = compile_metadata_df()
# TODO: add an option to request full-text corpora only?
meta_df.write_parquet(output_file)


# unicode line separator; used in some internet poems text files
Expand All @@ -253,6 +123,7 @@ def _text_for_search(expr):
.str.replace_all(r"(\w) \| -(\w)", "$1$2")
# replace other punctuation with spaces
.str.replace_all("[[:punct:]]", " ")
.str.replace_all("\\*", "") # remove asterisk
.str.replace_all(
LINE_SEPARATOR, "\n"
) # replace unicode line separator with newline
@@ -338,20 +209,6 @@ def multiple_matches(filtered_ref_df):
if match_df is not None:
return match_df, reason

# if author/title duplication check failed, check for author matches
# poetry foundation includes Shakespeare drama excerpts with alternate names
authordupe_df = df.filter(df.select(["_author"]).is_duplicated())
if not authordupe_df.is_empty():
# Shakespeare shows up oddly in poetry foundation;
# if author matches, assume the other source has the correct title
non_poetryfoundtn = authordupe_df.filter(
pl.col("source") != SOURCE_ID["Poetry Foundation"]
)
if non_poetryfoundtn.height == 1:
match_df = non_poetryfoundtn.limit(1)
reason = "duplicate author but not title; excluding Poetry Foundation"
return match_df, reason

return None, None


@@ -385,9 +242,9 @@ def identify_excerpt(
)
match_info = None
result = None
# preserve any notes on the incoming excerpt
# (is this what we want? notes might get duplicated if/when we merge...)
note_lines = [excerpt_row["notes"]] if excerpt.notes is not None else []
# notes on the incoming excerpt are ignored, since we assume
# the output will be merged.
# If we preserve existing notes here they will be duplicated on merge.

search_field = f"search_{search_text}"
search_field_label = search_text.replace("_", " ")
@@ -450,8 +307,6 @@ def identify_excerpt(
# but we only return labeled excerpts, so out of scope for now

if match_df is not None:
# rename columns for export
match_df = match_df.rename({"id": "poem_id", "source": "ref_corpus"})
# get the first row as a dictionary
match_info = match_df.row(0, named=True)

@@ -489,9 +344,7 @@ def identify_excerpt(
]

# add note about how the match was determined
# return as new field; must be merged with notes in calling code
note_lines.append(f"{SCRIPT_ID}: {id_note}")
match_info["notes"] = "\n".join(note_lines).strip()
match_info["notes"] = f"{SCRIPT_ID}: {id_note}"
# set id method
match_info["identification_methods"] = {SCRIPT_ID}

@@ -596,13 +449,13 @@ def process(input_file, output_file, recompile=False):
reference_df = pl.read_parquet(TEXT_PARQUET_FILE)
meta_df = pl.read_parquet(META_PARQUET_FILE)
print(f"Poetry reference text data: {reference_df.height:,} entries")
print("total by source")
source_counts = reference_df["source"].value_counts()
print("total by reference corpus")
source_counts = reference_df["ref_corpus"].value_counts()
for value, count in source_counts.iter_rows():
# row is a tuple of value, count
print(f"\t{value}: {count:,}")

# some texts from poetry foundation and maybe Chadwyck-Healey are truncated
# some texts from Chadwyck-Healey are truncated
# discard them to avoid bad partial/fuzzy matches
reference_df = reference_df.with_columns(text_length=pl.col("text").str.len_chars())
min_length = 15
@@ -611,8 +464,8 @@
print(f" Omitting {short_texts.height} poems with text length < {min_length}")

print(f"Poetry reference metadata: {meta_df.height:,} entries")
print("total by source")
source_counts = meta_df["source"].value_counts()
print("total by reference corpus")
source_counts = meta_df["ref_corpus"].value_counts()
for value, count in source_counts.iter_rows():
# row is a tuple of value, count
print(f"\t{value}: {count:,}")
@@ -621,16 +474,16 @@
reference_df = reference_df.join(
meta_df,
# join on the combination of poem id and source id
on=pl.concat_str([pl.col("id"), pl.col("source")], separator="|"),
on=pl.concat_str([pl.col("poem_id"), pl.col("ref_corpus")], separator="|"),
how="left", # occasionally ids do not match,
# e.g. Chadwyck Healey poem id we have text for but not in metadata
).drop("id_right", "source_right")
).drop("poem_id_right", "ref_corpus_right")

# generate a simplified text field for searching
# NOTE: this part is a bit slow
reference_df = generate_search_text(reference_df)

# load csv with excerpt fieldnames
# load csv with excerpt field names
try:
input_df = fix_data_types(pl.read_csv(input_file, columns=EXCERPT_FIELDS))
except pl.exceptions.NoDataError as err: