Skip to content

Commit d3891c2

Browse files
committed
fix: cast who to String so that it won't be null
1 parent 8728f96 commit d3891c2

File tree

1 file changed

+15
-4
lines changed

1 file changed

+15
-4
lines changed

src/export_records.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,11 @@ def scrape_record(record):
100100
speech_dict["speech"] = speech_id
101101
speech_list.append(speech_dict)
102102

103-
df = pl.DataFrame(speech_list)
103+
df = pl.DataFrame(speech_list, infer_schema_length=None)
104104
df = df.select("speech", "record", "ix", "who", "text")
105+
106+
# Make sure "who" is pl.String even when every "who" value in the record happens to be null
107+
df = df.with_columns(pl.col("who").cast(pl.String))
105108
return df, metadata
106109

107110

@@ -120,12 +123,10 @@ def main(args):
120123
df = pl.concat(all_dfs)
121124
df = df.sort("record", "ix")
122125
df = df.select("speech", "record", "who", "text")
123-
print(df)
124126

125127
metadata_df = pl.DataFrame(record_metadata)
126128
metadata_df = metadata_df.select("record", "sitting", "chamber", "number", "start_date", "end_date")
127129
metadata_df = metadata_df.sort("sitting", "chamber", "number")
128-
print(metadata_df.columns)
129130

130131
if "sqlite" in args.formats:
131132
LOGGER.train("Export to sqlite")
@@ -140,9 +141,19 @@ def main(args):
140141
connection="sqlite:///records.sqlite",
141142
)
142143

144+
# Flattened formats
145+
df = df.join(metadata_df, on="record")
146+
df.sort("sitting", "chamber", "number")
147+
df = df.with_columns(pl.col("sitting").str.head(3).alias("decade"))
148+
143149
if "ndjson" in args.formats:
144150
LOGGER.train("Export to ndjson")
145-
df.write_ndjson("records_speeches.ndjson")
151+
for decade in sorted(set(df["decade"])):
152+
df_decade = df.filter(pl.col("decade") == decade)
153+
df_decade_columns = [col for col in df_decade.columns if col != "decade"]
154+
df_decade = df_decade.select(df_decade_columns)
155+
LOGGER.info(f"{decade}:\ndf_decade")
156+
df_decade.write_ndjson(f"records_speeches_{decade}0s.ndjson")
146157

147158
if __name__ == "__main__":
148159
parser = fetch_parser("records")

0 commit comments

Comments
 (0)