Skip to content

Commit 8728f96

Browse files
committed
feat: export to ndjson, export metadata to .sqlite
1 parent fc78498 commit 8728f96

1 file changed

Lines changed: 49 additions & 11 deletions

File tree

src/export_records.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import multiprocessing
77
from pyriksdagen.utils import (
88
get_formatted_uuid,
9-
elem_iter
9+
elem_iter,
10+
infer_metadata
1011
)
1112
from pyriksdagen.utils import (
1213
TEI_NS,
@@ -23,16 +24,33 @@
2324
from trainerlog import get_logger
2425
from tqdm import tqdm
2526
import polars as pl
27+
from pathlib import Path
2628

2729
LOGGER = get_logger(name="export-records")
2830

2931
def scrape_record(record):
3032
root, _ = parse_tei(record, get_ns=True)
31-
33+
# Get protocol metadata
34+
record_id = root.attrib[f"{XML_NS}id"]
35+
metadata = infer_metadata(record_id)
36+
metadata["record"] = record_id
37+
38+
for front in root.findall(f".//{TEI_NS}front"):
39+
for docDate in front.findall(f".//{TEI_NS}docDate"):
40+
date = docDate.attrib["when"]
41+
if metadata.get("start_date") is None:
42+
metadata["start_date"] = date
43+
metadata["end_date"] = date
44+
45+
if metadata.get("start_date") > date:
46+
metadata["start_date"] = date
47+
if metadata.get("end_date") < date:
48+
metadata["end_date"] = date
49+
50+
51+
# Get speeches
3252
speeches = {}
3353
all_u_ids = set()
34-
# Add IDs for divs
35-
record_id = root.attrib[f"{XML_NS}id"]
3654
for textDesc in root.findall(f".//{TEI_NS}textDesc"):
3755
for constitution in textDesc.findall(f".//{TEI_NS}constitution"):
3856
speech_index = 0
@@ -51,7 +69,7 @@ def scrape_record(record):
5169
speech_index += 1
5270

5371
if len(speeches) == 0:
54-
return None
72+
return None, metadata
5573

5674
for u in root.findall(f".//{TEI_NS}u"):
5775
u_id = u.attrib[f"{XML_NS}id"]
@@ -84,14 +102,16 @@ def scrape_record(record):
84102

85103
df = pl.DataFrame(speech_list)
86104
df = df.select("speech", "record", "ix", "who", "text")
87-
return df
105+
return df, metadata
88106

89107

90108
def main(args):
91109
protocols = args.records
92110
all_dfs = []
111+
record_metadata = []
93112
for record in tqdm(args.records):
94-
df = scrape_record(record)
113+
df, metadata = scrape_record(record)
114+
record_metadata.append(metadata)
95115
if df is None:
96116
LOGGER.error(f"No speeches in {record}")
97117
else:
@@ -102,12 +122,30 @@ def main(args):
102122
df = df.select("speech", "record", "who", "text")
103123
print(df)
104124

105-
df.write_database(
106-
table_name="records_speeches",
107-
connection="sqlite:///records_speeches.sqlite",
108-
)
125+
metadata_df = pl.DataFrame(record_metadata)
126+
metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date")
127+
metadata_df = metadata_df.sort("sitting", "chamber", "number")
128+
print(metadata_df.columns)
129+
130+
if "sqlite" in args.formats:
131+
LOGGER.train("Export to sqlite")
132+
if Path("records.sqlite").exists():
133+
Path("records.sqlite").unlink()
134+
df.write_database(
135+
table_name="speeches",
136+
connection="sqlite:///records.sqlite",
137+
)
138+
metadata_df.write_database(
139+
table_name="records",
140+
connection="sqlite:///records.sqlite",
141+
)
142+
143+
if "ndjson" in args.formats:
144+
LOGGER.train("Export to ndjson")
145+
df.write_ndjson("records_speeches.ndjson")
109146

110147
if __name__ == "__main__":
    # Script entry point: build the shared "records" argument parser, add the
    # export-format option, and run the export.
    parser = fetch_parser("records")
    # Accept one or more formats so the parsed value is always a list, like the
    # default. With a single string value, the `"sqlite" in args.formats` and
    # `"ndjson" in args.formats` checks in main() were substring matches.
    parser.add_argument(
        "--formats",
        type=str,
        nargs="+",
        default=["sqlite", "ndjson"],
        help="Output formats to write: sqlite and/or ndjson "
             "(space- or comma-separated).",
    )
    parsed = parser.parse_args()
    # Also split comma-separated items, so an invocation such as
    # `--formats sqlite,ndjson` (which the old substring check accepted)
    # still selects both formats.
    parsed.formats = [
        fmt for item in parsed.formats for fmt in item.split(",") if fmt
    ]
    args = impute_args(parsed)
    main(args)

0 commit comments

Comments
 (0)