From b9dd448c42dfd3bcb3b589b74cb76f29bdc84c8b Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 27 Feb 2026 12:09:36 +0200 Subject: [PATCH 1/6] fix: docDate parsing; refactor: remove dependency on deprecated function --- src/sample_pages_new.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index 245d66c..9a968ec 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd from lxml import etree -import argparse, progressbar, hashlib +import argparse, tqdm, hashlib -from pyriksdagen.utils import infer_metadata, protocol_iterators +from pyriksdagen.utils import infer_metadata, corpus_iterator tei_ns = "{http://www.tei-c.org/ns/1.0}" xml_ns = "{http://www.w3.org/XML/1998/namespace}" @@ -15,13 +15,14 @@ def get_date(root): for docDate in root.findall(f".//{tei_ns}docDate"): date_string = docDate.text + date_string = " ".join(date_string.split()).strip() break return date_string def get_page_counts(corpus_path="corpus/protocols/"): parser = etree.XMLParser(remove_blank_text=True) rows = [] - for protocol_path in progressbar.progressbar(list(protocol_iterators(corpus_path, start=args.start, end=args.end))): + for protocol_path in tqdm.tqdm(list(corpus_iterator("prot", corpus_root=corpus_path, start=args.start, end=args.end))): root = etree.parse(protocol_path, parser) pbs = root.findall(f".//{tei_ns}pb") year = get_date(root)[:4] @@ -32,11 +33,15 @@ def get_page_counts(corpus_path="corpus/protocols/"): return df def get_pagenumber(link): - link = link.replace(".jp2/_view", "") - link = link.split("-")[-1] - link = link.split("page=")[-1] - if link.isnumeric(): - return int(link) + if ".jp2" in link: + link = link.replace(".jp2/_view", "") + link = link.split("-")[-1] + link = link.split("page=")[-1] + if link.isnumeric(): + return int(link) + else: + raise UserError + def sample_page_counts(df, start, end, n, seed=None): df = df[df["year"] >= start] From b0498995ff924583eb7ae98e53118e3929e110c2 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 27 Feb 2026 12:12:27 +0200 Subject: [PATCH 2/6] refactor: remove unnecessary function --- src/sample_pages_new.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index 9a968ec..ef7fd7a 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -32,17 +32,6 @@ def get_page_counts(corpus_path="corpus/protocols/"): df = pd.DataFrame(rows, columns=["protocol_path", "protocol_id", "year", "pages"]) return df -def get_pagenumber(link): - if ".jp2" in link: - link = link.replace(".jp2/_view", "") - link = link.split("-")[-1] - link = link.split("page=")[-1] - if link.isnumeric(): - return int(link) - else: - raise UserError - - def sample_page_counts(df, start, end, n, seed=None): df = df[df["year"] >= start] df = df[df["year"] <= end].copy() From 30eeaf181a20f9bb851276e5e758c8eead462928 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 27 Feb 2026 12:19:06 +0200 Subject: [PATCH 3/6] refactor: enable writing output to a single file instead of one per decade --- src/sample_pages_new.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index ef7fd7a..8c475ab 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -133,6 +133,7 @@ def flatten(df): parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") parser.add_argument("--flatten", type=bool, default=False, help="Flatten output to only contain pages instead of elements") + parser.add_argument("--output_file", type=str, default=None, help="Write output here, to a single CSV file, intead of one per decade") args = parser.parse_args() digest = hashlib.md5(args.seed.encode("utf-8")).digest() @@ -142,6 +143,7 @@ def flatten(df): protocol_df = get_page_counts(path) print(protocol_df) + all_samples = [] for decade in range(args.start // 10 * 10, args.end, 10): print("Decade:", decade) sample = sample_page_counts(protocol_df, decade, decade + 9, n=args.pages_per_decade, seed=digest) @@ -163,9 +165,16 @@ def flatten(df): if args.flatten: sample = flatten(sample) - sample.to_csv(f"{args.qc_folder}/sample_{decade}.csv", index=False) - - protocols_unique = list(sample.protocol_id.unique()) - with open(f"{args.qc_folder}/sample_{decade}.txt", "w+") as outf: - for up in protocols_unique: - outf.write(f"{args.records_folder}/{up.split('-')[1]}/{up}.xml\n") + if args.output_file is None: + sample.to_csv(f"{args.qc_folder}/sample_{decade}.csv", index=False) + + protocols_unique = list(sample.protocol_id.unique()) + with open(f"{args.qc_folder}/sample_{decade}.txt", "w+") as outf: + for up in protocols_unique: + outf.write(f"{args.records_folder}/{up.split('-')[1]}/{up}.xml\n") + else: + all_samples.append(sample) + + if args.output_file is not None: + sample = pd.concat(all_samples) + sample.to_csv(args.output_file) From ce57c559b41516633ee38a0eca0caf9271d6be75 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Wed, 11 Mar 2026 15:01:32 +0100 Subject: [PATCH 4/6] refactor: cleaner imports --- src/sample_pages_new.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index 8c475ab..48c54a9 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -5,9 +5,11 @@ import numpy as np import pandas as pd from lxml import etree -import argparse, tqdm, hashlib +import argparse +import tqdm +import hashlib -from pyriksdagen.utils import infer_metadata, corpus_iterator +from pyriksdagen.utils import corpus_iterator tei_ns = "{http://www.tei-c.org/ns/1.0}" xml_ns = "{http://www.w3.org/XML/1998/namespace}" From c9fbe3a9a40926d4cdbd0ffe7d8835b9d5db74b9 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Wed, 11 Mar 2026 15:06:42 +0100 Subject: [PATCH 5/6] refactor: address review --- src/sample_pages_new.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index 48c54a9..26d9b23 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -14,6 +14,9 @@ tei_ns = "{http://www.tei-c.org/ns/1.0}" xml_ns = "{http://www.w3.org/XML/1998/namespace}" +from trainerlog import get_logger +LOGGER = get_logger("sample-pages") + def get_date(root): for docDate in root.findall(f".//{tei_ns}docDate"): date_string = docDate.text @@ -129,7 +132,7 @@ def flatten(df): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--records_folder', type=str, default="corpus/protocols") parser.add_argument('--qc_folder', type=str, default="input/quality-control") - parser.add_argument("-f", '--seed', type=str, default=None, help="Random state seed") + parser.add_argument("-f", '--seed', type=str, required=True, help="Random state seed") parser.add_argument("-b", "--branch", type=str, default="main", help="Github branch where curation is happening.") parser.add_argument('-p', '--pages_per_decade', type=int, default=30, help="How many pages per decade? 30") parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") @@ -143,13 +146,13 @@ def flatten(df): path = args.records_folder protocol_df = get_page_counts(path) - print(protocol_df) + LOGGER.info(f"Protocols:\n{protocol_df}") all_samples = [] for decade in range(args.start // 10 * 10, args.end, 10): - print("Decade:", decade) + LOGGER.info(f"Decade: {decade}") sample = sample_page_counts(protocol_df, decade, decade + 9, n=args.pages_per_decade, seed=digest) - print(sample) + LOGGER.info(f"Sample:\n{sample}") prng = np.random.RandomState( (digest+decade) % (2**32)) sample = sample_pages(sample, random_state=prng) From 4a01df6e9645e517e3aab4ecb1a7fda1cb5d885c Mon Sep 17 00:00:00 2001 From: ninpnin Date: Wed, 11 Mar 2026 15:16:31 +0100 Subject: [PATCH 6/6] refactor: better printouts --- src/sample_pages_new.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sample_pages_new.py b/src/sample_pages_new.py index 26d9b23..a398d2d 100644 --- a/src/sample_pages_new.py +++ b/src/sample_pages_new.py @@ -25,6 +25,7 @@ def get_date(root): return date_string def get_page_counts(corpus_path="corpus/protocols/"): + LOGGER.info("Load records in to calculate page counts...") parser = etree.XMLParser(remove_blank_text=True) rows = [] for protocol_path in tqdm.tqdm(list(corpus_iterator("prot", corpus_root=corpus_path, start=args.start, end=args.end))): @@ -140,13 +141,14 @@ def flatten(df): parser.add_argument("--flatten", type=bool, default=False, help="Flatten output to only contain pages instead of elements") parser.add_argument("--output_file", type=str, default=None, help="Write output here, to a single CSV file, intead of one per decade") args = parser.parse_args() + LOGGER.train(f"Args: {args}") digest = hashlib.md5(args.seed.encode("utf-8")).digest() digest = int.from_bytes(digest, "big") % (2**32) path = args.records_folder protocol_df = get_page_counts(path) - LOGGER.info(f"Protocols:\n{protocol_df}") + LOGGER.info(f"Do sampling for the following records:\n{protocol_df}") all_samples = [] for decade in range(args.start // 10 * 10, args.end, 10):