66import multiprocessing
77from pyriksdagen .utils import (
88 get_formatted_uuid ,
9- elem_iter
9+ elem_iter ,
10+ infer_metadata
1011)
1112from pyriksdagen .utils import (
1213 TEI_NS ,
2324from trainerlog import get_logger
2425from tqdm import tqdm
2526import polars as pl
27+ from pathlib import Path
2628
2729LOGGER = get_logger (name = "export-records" )
2830
2931def scrape_record (record ):
3032 root , _ = parse_tei (record , get_ns = True )
31-
33+ # Get protocol metadata
34+ record_id = root .attrib [f"{ XML_NS } id" ]
35+ metadata = infer_metadata (record_id )
36+ metadata ["record" ] = record_id
37+
38+ for front in root .findall (f".//{ TEI_NS } front" ):
39+ for docDate in front .findall (f".//{ TEI_NS } docDate" ):
40+ date = docDate .attrib ["when" ]
41+ if metadata .get ("start_date" ) is None :
42+ metadata ["start_date" ] = date
43+ metadata ["end_date" ] = date
44+
45+ if metadata .get ("start_date" ) > date :
46+ metadata ["start_date" ] = date
47+ if metadata .get ("end_date" ) < date :
48+ metadata ["end_date" ] = date
49+
50+
51+ # Get speeches
3252 speeches = {}
3353 all_u_ids = set ()
34- # Add IDs for divs
35- record_id = root .attrib [f"{ XML_NS } id" ]
3654 for textDesc in root .findall (f".//{ TEI_NS } textDesc" ):
3755 for constitution in textDesc .findall (f".//{ TEI_NS } constitution" ):
3856 speech_index = 0
@@ -51,7 +69,7 @@ def scrape_record(record):
5169 speech_index += 1
5270
5371 if len (speeches ) == 0 :
54- return None
72+ return None , metadata
5573
5674 for u in root .findall (f".//{ TEI_NS } u" ):
5775 u_id = u .attrib [f"{ XML_NS } id" ]
@@ -84,14 +102,16 @@ def scrape_record(record):
84102
85103 df = pl .DataFrame (speech_list )
86104 df = df .select ("speech" , "record" , "ix" , "who" , "text" )
87- return df
105+ return df , metadata
88106
89107
90108def main (args ):
91109 protocols = args .records
92110 all_dfs = []
111+ record_metadata = []
93112 for record in tqdm (args .records ):
94- df = scrape_record (record )
113+ df , metadata = scrape_record (record )
114+ record_metadata .append (metadata )
95115 if df is None :
96116 LOGGER .error (f"No speeches in { record } " )
97117 else :
@@ -102,12 +122,30 @@ def main(args):
102122 df = df .select ("speech" , "record" , "who" , "text" )
103123 print (df )
104124
105- df .write_database (
106- table_name = "records_speeches" ,
107- connection = "sqlite:///records_speeches.sqlite" ,
108- )
125+ metadata_df = pl .DataFrame (record_metadata )
126+ metadata_df = metadata_df .select ("record" , "sitting" , "chamber" , "number" , "start_date" , "end_date" )
127+ metadata_df = metadata_df .sort ("sitting" , "chamber" , "number" )
128+ print (metadata_df .columns )
129+
130+ if "sqlite" in args .formats :
131+ LOGGER .train ("Export to sqlite" )
132+ if Path ("records.sqlite" ).exists ():
133+ Path ("records.sqlite" ).unlink ()
134+ df .write_database (
135+ table_name = "speeches" ,
136+ connection = "sqlite:///records.sqlite" ,
137+ )
138+ metadata_df .write_database (
139+ table_name = "records" ,
140+ connection = "sqlite:///records.sqlite" ,
141+ )
142+
143+ if "ndjson" in args .formats :
144+ LOGGER .train ("Export to ndjson" )
145+ df .write_ndjson ("records_speeches.ndjson" )
109146
if __name__ == "__main__":
    parser = fetch_parser("records")
    # Use nargs="+" so `--formats sqlite ndjson` parses to a list, matching the
    # list default. With bare `type=str` a single `--formats sqlite` would yield
    # a plain string, turning the later `"sqlite" in args.formats` /
    # `"ndjson" in args.formats` membership checks into substring tests.
    parser.add_argument(
        "--formats",
        type=str,
        nargs="+",
        default=["sqlite", "ndjson"],
        help="Output formats to write (any of: sqlite, ndjson; default: both).",
    )
    args = impute_args(parser.parse_args())
    main(args)
0 commit comments