From fc78498ebc276c2ec970fe877b035253de7960d6 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Wed, 11 Mar 2026 19:12:33 +0100 Subject: [PATCH 01/10] feat: collate all speeches and export to .sqlite database --- src/export_records.py | 113 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 src/export_records.py diff --git a/src/export_records.py b/src/export_records.py new file mode 100644 index 0000000..6498c37 --- /dev/null +++ b/src/export_records.py @@ -0,0 +1,113 @@ +""" +Add a randomly generated UUID to all elements in the XML ID attribute that are currently missing one. + +Also adds the document ID (eg. prot-year--number) in the TEI element as an XML ID attribute if its missing. +""" +import multiprocessing +from pyriksdagen.utils import ( + get_formatted_uuid, + elem_iter +) +from pyriksdagen.utils import ( + TEI_NS, + XML_NS +) +from pyriksdagen.io import ( + parse_tei, + write_tei +) +from pyriksdagen.args import ( + fetch_parser, + impute_args, +) +from trainerlog import get_logger +from tqdm import tqdm +import polars as pl + +LOGGER = get_logger(name="export-records") + +def scrape_record(record): + root, _ = parse_tei(record, get_ns=True) + + speeches = {} + all_u_ids = set() + # Add IDs for divs + record_id = root.attrib[f"{XML_NS}id"] + for textDesc in root.findall(f".//{TEI_NS}textDesc"): + for constitution in textDesc.findall(f".//{TEI_NS}constitution"): + speech_index = 0 + for speech_note in constitution: + speech_id = speech_note.attrib[f"{XML_NS}id"] + #print(speech_id) + + # scrape u tags from linkGrp + u_ids = set() + for ptr in speech_note.findall(f".//{TEI_NS}ptr"): + u_id = ptr.attrib["target"].replace("#", "") + u_ids.add(u_id) + all_u_ids.add(u_id) + + speeches[speech_id] = {"record": record_id, "u_ids": u_ids, "who": None, "text": None, "ix": speech_index} + speech_index += 1 + + if len(speeches) == 0: + return None + + for u in root.findall(f".//{TEI_NS}u"): + u_id = u.attrib[f"{XML_NS}id"] + if u_id in all_u_ids: + LOGGER.debug(f"u {u_id} in all u ids") + speech = None + for i in speeches: + if u_id in speeches[i]["u_ids"]: + speech = i + + LOGGER.debug(f"u {u_id} belongs to speech: {speech}") + who = u.attrib["who"] + if who == "unknown": + who = None + speeches[speech]["who"] = who + for seg in u: + text = " ".join(seg.text.split()) + if speeches[speech]["text"] is None: + speeches[speech]["text"] = text + else: + speeches[speech]["text"] += "\n\n" + text + + + + speech_list = [] + for speech_id in speeches: + speech_dict = speeches[speech_id] + speech_dict["speech"] = speech_id + speech_list.append(speech_dict) + + df = pl.DataFrame(speech_list) + df = df.select("speech", "record", "ix", "who", "text") + return df + + +def main(args): + protocols = args.records + all_dfs = [] + for record in tqdm(args.records): + df = scrape_record(record) + if df is None: + LOGGER.error(f"No speeches in {record}") + else: + all_dfs.append(df) + + df = pl.concat(all_dfs) + df = df.sort("record", "ix") + df = df.select("speech", "record", "who", "text") + print(df) + + df.write_database( + table_name="records_speeches", + connection="sqlite:///records_speeches.sqlite", + ) + +if __name__ == "__main__": + parser = fetch_parser("records") + args = impute_args(parser.parse_args()) + main(args) \ No newline at end of file From 8728f964d795f2d5d81f2e13099eed5deada1e0d Mon Sep 17 00:00:00 2001 From: ninpnin Date: Wed, 11 Mar 2026 19:27:03 +0100 Subject: [PATCH 02/10] feat: export to ndjson, export metadata to .sqlite --- src/export_records.py | 60 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index 6498c37..031dd02 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -6,7 +6,8 @@ import multiprocessing from pyriksdagen.utils import ( get_formatted_uuid, - elem_iter + elem_iter, + infer_metadata ) from pyriksdagen.utils import ( TEI_NS, @@ -23,16 +24,33 @@ from trainerlog import get_logger from tqdm import tqdm import polars as pl +from pathlib import Path LOGGER = get_logger(name="export-records") def scrape_record(record): root, _ = parse_tei(record, get_ns=True) - + # Get protocol metadata + record_id = root.attrib[f"{XML_NS}id"] + metadata = infer_metadata(record_id) + metadata["record"] = record_id + + for front in root.findall(f".//{TEI_NS}front"): + for docDate in front.findall(f".//{TEI_NS}docDate"): + date = docDate.attrib["when"] + if metadata.get("start_date") is None: + metadata["start_date"] = date + metadata["end_date"] = date + + if metadata.get("start_date") > date: + metadata["start_date"] = date + if metadata.get("end_date") < date: + metadata["end_date"] = date + + + # Get speeches speeches = {} all_u_ids = set() - # Add IDs for divs - record_id = root.attrib[f"{XML_NS}id"] for textDesc in root.findall(f".//{TEI_NS}textDesc"): for constitution in textDesc.findall(f".//{TEI_NS}constitution"): speech_index = 0 @@ -51,7 +69,7 @@ def scrape_record(record): speech_index += 1 if len(speeches) == 0: - return None + return None, metadata for u in root.findall(f".//{TEI_NS}u"): u_id = u.attrib[f"{XML_NS}id"] @@ -84,14 +102,16 @@ def scrape_record(record): df = pl.DataFrame(speech_list) df = df.select("speech", "record", "ix", "who", "text") - return df + return df, metadata def main(args): protocols = args.records all_dfs = [] + record_metadata = [] for record in tqdm(args.records): - df = scrape_record(record) + df, metadata = scrape_record(record) + record_metadata.append(metadata) if df is None: LOGGER.error(f"No speeches in {record}") else: @@ -102,12 +122,30 @@ def main(args): df = df.select("speech", "record", "who", "text") print(df) - df.write_database( - table_name="records_speeches", - connection="sqlite:///records_speeches.sqlite", - ) + metadata_df = pl.DataFrame(record_metadata) + metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date") + metadata_df = metadata_df.sort("sitting", "chamber", "number") + print(metadata_df.columns) + + if "sqlite" in args.formats: + LOGGER.train("Export to sqlite") + if Path("records.sqlite").exists(): + Path("records.sqlite").unlink() + df.write_database( + table_name="speeches", + connection="sqlite:///records.sqlite", + ) + metadata_df.write_database( + table_name="records", + connection="sqlite:///records.sqlite", + ) + + if "ndjson" in args.formats: + LOGGER.train("Export to ndjson") + df.write_ndjson("records_speeches.ndjson") if __name__ == "__main__": parser = fetch_parser("records") + parser.add_argument("--formats", type=str, default=["sqlite", "ndjson"]) args = impute_args(parser.parse_args()) main(args) \ No newline at end of file From d3891c270aa046dfdb9d23651161ffcd514dcefc Mon Sep 17 00:00:00 2001 From: ninpnin Date: Thu, 12 Mar 2026 11:00:38 +0100 Subject: [PATCH 03/10] fix: cast who to String so that it wont be null --- src/export_records.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index 031dd02..e90a250 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -100,8 +100,11 @@ def scrape_record(record): speech_dict["speech"] = speech_id speech_list.append(speech_dict) - df = pl.DataFrame(speech_list) + df = pl.DataFrame(speech_list, infer_schema_length=None) df = df.select("speech", "record", "ix", "who", "text") + + # Make sure who is pl.String in case all who's happen to be null + df = df.with_columns(pl.col("who").cast(pl.String)) return df, metadata @@ -120,12 +123,10 @@ def main(args): df = pl.concat(all_dfs) df = df.sort("record", "ix") df = df.select("speech", "record", "who", "text") - print(df) metadata_df = pl.DataFrame(record_metadata) metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date") metadata_df = metadata_df.sort("sitting", "chamber", "number") - print(metadata_df.columns) if "sqlite" in args.formats: LOGGER.train("Export to sqlite") @@ -140,9 +141,19 @@ def main(args): connection="sqlite:///records.sqlite", ) + # Flattened formats + df = df.join(metadata_df, on="record") + df.sort("sitting", "chamber", "number") + df = df.with_columns(pl.col("sitting").str.head(3).alias("decade")) + if "ndjson" in args.formats: LOGGER.train("Export to ndjson") - df.write_ndjson("records_speeches.ndjson") + for decade in sorted(set(df["decade"])): + df_decade = df.filter(pl.col("decade") == decade) + df_decade_columns = [col for col in df_decade.columns if col != "decade"] + df_decade = df_decade.select(df_decade_columns) + LOGGER.info(f"{decade}:\ndf_decade") + df_decade.write_ndjson(f"records_speeches_{decade}0s.ndjson") if __name__ == "__main__": parser = fetch_parser("records") From 44bafc8123876a4cc59e93ebd8b088b0ae961f6c Mon Sep 17 00:00:00 2001 From: ninpnin Date: Thu, 12 Mar 2026 13:17:28 +0100 Subject: [PATCH 04/10] chore: option to export to ndjson vs decade ndjson --- src/export_records.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index e90a250..65b8e44 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -116,7 +116,7 @@ def main(args): df, metadata = scrape_record(record) record_metadata.append(metadata) if df is None: - LOGGER.error(f"No speeches in {record}") + LOGGER.warning(f"No speeches in {record}") else: all_dfs.append(df) @@ -146,14 +146,19 @@ def main(args): df.sort("sitting", "chamber", "number") df = df.with_columns(pl.col("sitting").str.head(3).alias("decade")) - if "ndjson" in args.formats: - LOGGER.train("Export to ndjson") + if "ndjson-decade" in args.formats: + LOGGER.train("Export to ndjson by decade") for decade in sorted(set(df["decade"])): df_decade = df.filter(pl.col("decade") == decade) df_decade_columns = [col for col in df_decade.columns if col != "decade"] df_decade = df_decade.select(df_decade_columns) LOGGER.info(f"{decade}:\ndf_decade") df_decade.write_ndjson(f"records_speeches_{decade}0s.ndjson") + if "ndjson" in args.formats: + LOGGER.train("Export to one ndjson file") + df_decade_columns = [col for col in df.columns if col != "decade"] + df_decade = df.select(df_decade_columns) + df_decade.write_ndjson(f"records_speeches.ndjson") if __name__ == "__main__": parser = fetch_parser("records") From 1ef0987b98f89d5e388f046ffba468b8a4216638 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 13 Mar 2026 16:17:30 +0100 Subject: [PATCH 05/10] fix: maintain chronologic order of speeches within a record in the ndjson export --- src/export_records.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index 65b8e44..c9fac88 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -122,14 +122,15 @@ def main(args): df = pl.concat(all_dfs) df = df.sort("record", "ix") - df = df.select("speech", "record", "who", "text") + df = df.rename({"ix": "speech_number"}) + df = df.select("speech", "record", "who", "text", "speech_number") metadata_df = pl.DataFrame(record_metadata) metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date") metadata_df = metadata_df.sort("sitting", "chamber", "number") if "sqlite" in args.formats: - LOGGER.train("Export to sqlite") + LOGGER.info("Export to sqlite") if Path("records.sqlite").exists(): Path("records.sqlite").unlink() df.write_database( @@ -143,11 +144,11 @@ def main(args): # Flattened formats df = df.join(metadata_df, on="record") - df.sort("sitting", "chamber", "number") + df = df.sort("sitting", "chamber", "number", "speech_number") df = df.with_columns(pl.col("sitting").str.head(3).alias("decade")) if "ndjson-decade" in args.formats: - LOGGER.train("Export to ndjson by decade") + LOGGER.info("Export to ndjson by decade") for decade in sorted(set(df["decade"])): df_decade = df.filter(pl.col("decade") == decade) df_decade_columns = [col for col in df_decade.columns if col != "decade"] @@ -162,6 +163,6 @@ def main(args): if __name__ == "__main__": parser = fetch_parser("records") - parser.add_argument("--formats", type=str, default=["sqlite", "ndjson"]) + parser.add_argument("--formats", type=str, nargs="+", default=["sqlite", "ndjson"]) args = impute_args(parser.parse_args()) main(args) \ No newline at end of file From e2e274644a067a9ccd2e7d4ae100443ec5f9b1f5 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 13 Mar 2026 16:22:38 +0100 Subject: [PATCH 06/10] refactor: delete unnecessary imports and print out args --- src/export_records.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index c9fac88..e11bc78 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -3,10 +3,7 @@ Also adds the document ID (eg. prot-year--number) in the TEI element as an XML ID attribute if its missing. """ -import multiprocessing from pyriksdagen.utils import ( - get_formatted_uuid, - elem_iter, infer_metadata ) from pyriksdagen.utils import ( @@ -15,7 +12,6 @@ ) from pyriksdagen.io import ( parse_tei, - write_tei ) from pyriksdagen.args import ( fetch_parser, @@ -164,5 +160,7 @@ def main(args): if __name__ == "__main__": parser = fetch_parser("records") parser.add_argument("--formats", type=str, nargs="+", default=["sqlite", "ndjson"]) - args = impute_args(parser.parse_args()) + args = parser.parse_args() + LOGGER.train(f"Args: {args}") + args = impute_args(args) main(args) \ No newline at end of file From 46dab881abf9e48f2007f08899d44225f13d979e Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 13 Mar 2026 16:33:20 +0100 Subject: [PATCH 07/10] fix: better naming in the ndjson export --- src/export_records.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index e11bc78..410c7c2 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -120,10 +120,12 @@ def main(args): df = df.sort("record", "ix") df = df.rename({"ix": "speech_number"}) df = df.select("speech", "record", "who", "text", "speech_number") + df = df.with_columns(pl.col("speech_number") + 1) metadata_df = pl.DataFrame(record_metadata) - metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date") - metadata_df = metadata_df.sort("sitting", "chamber", "number") + metadata_df = metadata_df.rename({"sitting": "session"}) + metadata_df = metadata_df.select("record", "session", "chamber", "number", "start_date", "end_date") + metadata_df = metadata_df.sort("session", "chamber", "number") if "sqlite" in args.formats: LOGGER.info("Export to sqlite") @@ -140,8 +142,9 @@ def main(args): # Flattened formats df = df.join(metadata_df, on="record") - df = df.sort("sitting", "chamber", "number", "speech_number") - df = df.with_columns(pl.col("sitting").str.head(3).alias("decade")) + df = df.sort("session", "chamber", "number", "speech_number") + df = df.with_columns(pl.col("session").str.head(3).alias("decade")) + df = df.rename({"number": "record_number"}) if "ndjson-decade" in args.formats: LOGGER.info("Export to ndjson by decade") From 86608d6447f1cb34904c6df26deac4473a98c9c6 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Fri, 13 Mar 2026 16:38:53 +0100 Subject: [PATCH 08/10] docs: add docstring --- src/export_records.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index 410c7c2..cd08ce6 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -1,7 +1,8 @@ """ -Add a randomly generated UUID to all elements in the XML ID attribute that are currently missing one. +Export the speeches in the records to newline delimited JSON and/or sqlite -Also adds the document ID (eg. prot-year--number) in the TEI element as an XML ID attribute if its missing. +The NDJSON output is flattened in the sense that each row has record level metadata +while the sqlite is not and has two tables """ from pyriksdagen.utils import ( infer_metadata From 0a9775cb2e4bab57e7aaa2efcf60a586fe37e99c Mon Sep 17 00:00:00 2001 From: ninpnin Date: Tue, 24 Mar 2026 10:23:08 +0200 Subject: [PATCH 09/10] refactor: use standard logging levels; remove unnecessary variable --- src/export_records.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/export_records.py b/src/export_records.py index cd08ce6..0b5d2a3 100644 --- a/src/export_records.py +++ b/src/export_records.py @@ -106,7 +106,6 @@ def scrape_record(record): def main(args): - protocols = args.records all_dfs = [] record_metadata = [] for record in tqdm(args.records): @@ -156,7 +155,7 @@ def main(args): LOGGER.info(f"{decade}:\ndf_decade") df_decade.write_ndjson(f"records_speeches_{decade}0s.ndjson") if "ndjson" in args.formats: - LOGGER.train("Export to one ndjson file") + LOGGER.info("Export to one ndjson file") df_decade_columns = [col for col in df.columns if col != "decade"] df_decade = df.select(df_decade_columns) df_decade.write_ndjson(f"records_speeches.ndjson") @@ -165,6 +164,6 @@ def main(args): parser = fetch_parser("records") parser.add_argument("--formats", type=str, nargs="+", default=["sqlite", "ndjson"]) args = parser.parse_args() - LOGGER.train(f"Args: {args}") + LOGGER.info(f"Args: {args}") args = impute_args(args) main(args) \ No newline at end of file From fa23b1e8022c9a83eefe8b9dbd7e3228a4ebc1e3 Mon Sep 17 00:00:00 2001 From: ninpnin Date: Tue, 24 Mar 2026 10:23:23 +0200 Subject: [PATCH 10/10] fix: add missing dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b826ea7..a78605e 100755 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ transformers Unidecode Wikidata pytest-cfg-fetcher +polars \ No newline at end of file