@@ -100,8 +100,11 @@ def scrape_record(record):
100100 speech_dict ["speech" ] = speech_id
101101 speech_list .append (speech_dict )
102102
103- df = pl .DataFrame (speech_list )
103+ df = pl .DataFrame (speech_list , infer_schema_length = None )
104104 df = df .select ("speech" , "record" , "ix" , "who" , "text" )
105+
106+ # Make sure "who" is pl.String in case every "who" value happens to be null
107+ df = df .with_columns (pl .col ("who" ).cast (pl .String ))
105108 return df , metadata
106109
107110
@@ -120,12 +123,10 @@ def main(args):
120123 df = pl .concat (all_dfs )
121124 df = df .sort ("record" , "ix" )
122125 df = df .select ("speech" , "record" , "who" , "text" )
123- print (df )
124126
125127 metadata_df = pl .DataFrame (record_metadata )
126128 metadata_df = metadata_df .select ("record" , "sitting" , "chamber" , "number" , "start_date" , "end_date" )
127129 metadata_df = metadata_df .sort ("sitting" , "chamber" , "number" )
128- print (metadata_df .columns )
129130
130131 if "sqlite" in args .formats :
131132 LOGGER .train ("Export to sqlite" )
@@ -140,9 +141,19 @@ def main(args):
140141 connection = "sqlite:///records.sqlite" ,
141142 )
142143
144+ # Flattened formats
145+ df = df .join (metadata_df , on = "record" )
146+ df .sort ("sitting" , "chamber" , "number" )
147+ df = df .with_columns (pl .col ("sitting" ).str .head (3 ).alias ("decade" ))
148+
143149 if "ndjson" in args .formats :
144150 LOGGER .train ("Export to ndjson" )
145- df .write_ndjson ("records_speeches.ndjson" )
151+ for decade in sorted (set (df ["decade" ])):
152+ df_decade = df .filter (pl .col ("decade" ) == decade )
153+ df_decade_columns = [col for col in df_decade .columns if col != "decade" ]
154+ df_decade = df_decade .select (df_decade_columns )
155+ LOGGER .info (f"{ decade } :\n df_decade" )
156+ df_decade .write_ndjson (f"records_speeches_{ decade } 0s.ndjson" )
146157
147158if __name__ == "__main__" :
148159 parser = fetch_parser ("records" )
0 commit comments