From bef6f0af839643e0519bc80bc439450b57b8cba8 Mon Sep 17 00:00:00 2001 From: fritzrehde Date: Wed, 10 Sep 2025 16:35:52 +1000 Subject: [PATCH 1/5] refactor class parsing error handling --- src/course_scraper.rs | 112 ++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 52 deletions(-) diff --git a/src/course_scraper.rs b/src/course_scraper.rs index a0e2b62..abfbc82 100644 --- a/src/course_scraper.rs +++ b/src/course_scraper.rs @@ -226,69 +226,77 @@ fn parse_class_info( map.insert(key, value); i += 2; } - let offering_period_str = map.get("Offering Period").unwrap_or(&"").to_string(); + + let missing_field_error = |missing_field_name: &str| { + anyhow::anyhow!(format!( + "{} for course {} is missing", + missing_field_name, course_id + )) + }; + let get_expected_field = |field_name: &str| { + map.get(field_name) + .ok_or_else(|| missing_field_error(field_name)) + }; + + let offering_period_str = get_expected_field("Offering Period")?; let mut split_offering_period_str = offering_period_str.split(" - "); - let error_msg = format!("failed to parse a class for course {}", course_id); + let section = get_expected_field("Section")?; + let date = split_offering_period_str .next() - .ok_or_else(|| anyhow::anyhow!(error_msg.clone()))?; + .ok_or_else(|| missing_field_error("date"))?; let year = date .split("/") .nth(2) - .ok_or_else(|| anyhow::anyhow!(error_msg.clone()))?; + .ok_or_else(|| missing_field_error("year"))?; + let class_nr = get_expected_field("Class Nbr")?; + let term = get_expected_field("Teaching Period")? + .split(" - ") + .next() + .ok_or_else(|| { + anyhow::anyhow!(format!( + "failed to parse term from teaching period for course {}", + course_id + )) + })?; + + let class_id = format!("{}-{}-{}-{}", course_id, class_nr, term, year); + let activity = get_expected_field("Activity")?; + let status = get_expected_field("Status")?; + let course_enrolment = get_expected_field("Enrols/Capacity")?.replace("*", ""); + let offering_period = get_expected_field("Offering Period")?; + let meeting_dates = get_expected_field("Meeting Dates")?; + let census_date = get_expected_field("Census Date")?; + let mode = get_expected_field("Mode of Delivery")?; + let consent = get_expected_field("Consent")?; + let times = if !times_parsed.is_empty() { + Some(times_parsed) + } else { + None + }; + let class_notes = map + .get("Class Notes") + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()); + Ok(Class { course_id: course_id.to_string(), - class_id: format!( - "{}-{}-{}-{}", - course_id, - map.get("Class Nbr").unwrap_or(&""), - map.get("Teaching Period") - .unwrap_or(&"") - .split(" - ") - .next() - .ok_or_else(|| anyhow::anyhow!(format!( - "{}: {}", - &error_msg, "Could not split teaching periods properly!" - )))?, - year, - ), - section: map.get("Section").unwrap_or(&"").to_string(), - term: map - .get("Teaching Period") - .unwrap_or(&"") - .split(" - ") - .next() - .ok_or_else(|| { - anyhow::anyhow!(format!( - "{}: {}", - &error_msg, "Could not split teaching periods properly!" - )) - })? - .to_string(), + class_id, + section: section.to_string(), + term: term.to_string(), year: year.to_string(), - activity: map.get("Activity").unwrap_or(&"").to_string(), - status: map.get("Status").unwrap_or(&"").to_string(), - course_enrolment: map - .get("Enrols/Capacity") - .unwrap_or(&"") - .replace("*", "") - .to_string(), - offering_period: map.get("Offering Period").unwrap_or(&"").to_string(), - meeting_dates: map.get("Meeting Dates").unwrap_or(&"").to_string(), - census_date: map.get("Census Date").unwrap_or(&"").to_string(), - mode: map.get("Mode of Delivery").unwrap_or(&"").to_string(), - consent: map.get("Consent").unwrap_or(&"").to_string(), + activity: activity.to_string(), + status: status.to_string(), + course_enrolment, + offering_period: offering_period.to_string(), + meeting_dates: meeting_dates.to_string(), + census_date: census_date.to_string(), + mode: mode.to_string(), + consent: consent.to_string(), career: career.to_string(), - times: if times_parsed.is_empty() { - None - } else { - Some(times_parsed) - }, - class_notes: map - .get("Class Notes") - .filter(|s| !s.is_empty()) - .map(|s| s.to_string()), + times, + class_notes, }) } From 7dfe2c4c40cb9112f35cfa0798895ea175a550e9 Mon Sep 17 00:00:00 2001 From: fritzrehde Date: Thu, 11 Sep 2025 14:26:51 +1000 Subject: [PATCH 2/5] implemented revised course class and time ids --- src/course_scraper.rs | 137 +++++++++++++++++++++++++----------- src/subject_area_scraper.rs | 1 + 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/src/course_scraper.rs b/src/course_scraper.rs index abfbc82..015b8f3 100644 --- a/src/course_scraper.rs +++ b/src/course_scraper.rs @@ -4,18 +4,18 @@ use scraper::Selector; use serde::Serialize; use std::collections::{HashMap, HashSet}; -use crate::{ScrapingContext, text_manipulators::extract_text}; +use crate::{ScrapingContext, Year, text_manipulators::extract_text}; #[derive(Debug, Serialize)] pub struct Course { pub course_id: String, pub course_code: String, + pub year: Year, pub course_name: String, pub uoc: i32, - // TODO: try making non-optional. pub faculty: Option, pub school: Option, - pub career: Option, + pub career: String, // Sorted ascendingly. pub modes: Vec, // For Notangles. pub campus: Option, @@ -28,6 +28,7 @@ pub struct Class { pub course_id: String, pub career: String, pub class_id: String, + pub class_nr: String, pub section: String, pub term: String, pub year: String, @@ -45,10 +46,20 @@ pub struct Class { #[derive(Debug, Serialize)] pub struct Time { + pub time_id: String, pub career: String, + pub location: String, pub day: String, pub time: String, + pub weeks: String, + pub instructor: Option, +} + +#[derive(Debug, Serialize)] +pub struct PartialTime { pub location: String, + pub day: String, + pub time: String, pub weeks: String, pub instructor: Option, } @@ -60,6 +71,7 @@ pub struct PartialCourse { pub career: String, pub uoc: i32, pub url: String, + pub year: Year, } impl PartialCourse { @@ -162,14 +174,18 @@ impl PartialCourse { } } - let course_id = format!("{}{}", &self.course_code, career); + // The reason we aren't including the term in the course id is that the handbook only + // contains one page per course per year, which contains data for the course that year. + // We use the same format to reduce duplicated data. + let course_id = format!("{}-{}-{}", self.course_code, career, self.year); + let course_code = self.course_code; let course_name = self.course_name; let uoc = self.uoc; let classes: Vec = class_activity_information .into_par_iter() - .map(|class_data| parse_class_info(class_data, course_id.as_str(), career.as_ref())) + .map(|class_data| parse_class_info(class_data, &course_id, &course_code, &career)) .collect::>()?; let unique_modes: HashSet<&String> = classes.iter().map(|class| &class.mode).collect(); @@ -180,12 +196,13 @@ impl PartialCourse { Ok(Course { course_id, course_code, + year: self.year, course_name, uoc, faculty, school, campus, - career: Some(career), + career, modes, terms, classes, @@ -200,11 +217,12 @@ impl PartialCourse { fn parse_class_info( class_data: Vec, course_id: &str, - career: &str, + course_code: &str, + course_career: &str, ) -> anyhow::Result { let mut map: HashMap<&str, &str> = HashMap::new(); let mut i = 0; - let mut times_parsed = Vec::