Skip to content

Commit 6dc7598

Browse files
kuedaclaude
andcommitted
fix: avoid writing correupted zips with duplicate media files
Co-authored-by: Claude (claude-sonnet-4-6) <noreply@anthropic.com>
1 parent a73fc22 commit 6dc7598

2 files changed

Lines changed: 142 additions & 6 deletions

File tree

chuck-core/src/downloader.rs

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,12 @@ impl Downloader {
128128
let mut progress = DownloadProgress::default();
129129
let mut cumulative_media_seen: usize = 0;
130130

131+
// Track photo/sound IDs already committed to the ZIP so that photos shared
132+
// across observations (and therefore present in multiple API pages) are not
133+
// written twice, which would produce an invalid archive.
134+
let mut added_photo_ids: std::collections::HashSet<i32> = std::collections::HashSet::new();
135+
let mut added_sound_ids: std::collections::HashSet<i32> = std::collections::HashSet::new();
136+
131137
// Track pending media download from previous batch
132138
#[allow(clippy::type_complexity)]
133139
let mut pending_media: Option<(
@@ -184,9 +190,10 @@ impl Downloader {
184190
self.process_extensions(
185191
&observations, &mut archive, &photo_mapping, &sound_mapping, &taxa_hash
186192
).await?;
187-
for rel_path in photo_mapping.values().chain(sound_mapping.values()) {
188-
archive.add_media_from_temp(rel_path)?;
189-
}
193+
commit_media(
194+
&mut archive, &photo_mapping, &sound_mapping,
195+
&mut added_photo_ids, &mut added_sound_ids,
196+
)?;
190197
}
191198
break;
192199
}
@@ -241,9 +248,10 @@ impl Downloader {
241248
self.process_extensions(
242249
&observations, &mut archive, &photo_mapping, &sound_mapping, &prev_taxa_hash
243250
).await?;
244-
for rel_path in photo_mapping.values().chain(sound_mapping.values()) {
245-
archive.add_media_from_temp(rel_path)?;
246-
}
251+
commit_media(
252+
&mut archive, &photo_mapping, &sound_mapping,
253+
&mut added_photo_ids, &mut added_sound_ids,
254+
)?;
247255
}
248256

249257
// Start media downloads for current batch in background
@@ -507,6 +515,35 @@ impl Downloader {
507515
}
508516
}
509517

518+
/// Add media files from a batch to the archive, skipping any photo or sound IDs already
519+
/// committed in a previous batch. For skipped duplicates, removes the re-downloaded file
520+
/// without writing it to the ZIP.
521+
fn commit_media(
522+
archive: &mut crate::darwin_core::ArchiveBuilder,
523+
photo_mapping: &HashMap<i32, String>,
524+
sound_mapping: &HashMap<i32, String>,
525+
added_photo_ids: &mut std::collections::HashSet<i32>,
526+
added_sound_ids: &mut std::collections::HashSet<i32>,
527+
) -> Result<(), Box<dyn std::error::Error>> {
528+
for (id, rel_path) in photo_mapping {
529+
if added_photo_ids.insert(*id) {
530+
archive.add_media_from_temp(rel_path)?;
531+
} else {
532+
let local = archive.media_dir().join(rel_path.replace('\\', "/"));
533+
let _ = std::fs::remove_file(local);
534+
}
535+
}
536+
for (id, rel_path) in sound_mapping {
537+
if added_sound_ids.insert(*id) {
538+
archive.add_media_from_temp(rel_path)?;
539+
} else {
540+
let local = archive.media_dir().join(rel_path.replace('\\', "/"));
541+
let _ = std::fs::remove_file(local);
542+
}
543+
}
544+
Ok(())
545+
}
546+
510547
/// Compute an updated photo count estimate using a running average across observed batches.
511548
/// Returns the new estimate, but never less than `current_estimate` (never decreases).
512549
pub fn update_photo_estimate(

chuck-core/tests/downloader_integration_test.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,3 +946,102 @@ async fn test_downloader_downloads_sound_files() {
946946
"multimedia.csv should reference local sound file"
947947
);
948948
}
949+
950+
#[tokio::test]
951+
#[serial]
952+
async fn test_downloader_no_duplicate_zip_entry_when_photo_appears_in_two_batches() {
953+
// Regression: when the same photo_id appears in two different API pages (e.g. the same
954+
// photo is attached to two different observations), the downloader must not write it to
955+
// the ZIP twice. A duplicate ZIP entry causes `ZipArchive::new` to fail with
956+
// "Invalid Zip archive: Duplicate filename".
957+
let server = MockServer::start();
958+
959+
let config = chuck_core::api::client::create_config_with_base_url_and_jwt(
960+
server.base_url(),
961+
Some("test_jwt".to_string()),
962+
);
963+
964+
// Page 3: empty, stops pagination
965+
let _observations_page3_mock = server.mock(|when, then| {
966+
when.method(GET)
967+
.path("/observations")
968+
.query_param("id_below", "200");
969+
then.status(200)
970+
.header("content-type", "application/json")
971+
.json_body(observations_response_json(2, vec![]));
972+
});
973+
974+
// Page 2: observation 200 — same photo_id 9999 as page 1
975+
let _observations_page2_mock = server.mock(|when, then| {
976+
when.method(GET)
977+
.path("/observations")
978+
.query_param("id_below", "300");
979+
then.status(200)
980+
.header("content-type", "application/json")
981+
.json_body(observations_response_json(
982+
2,
983+
vec![observation_json(200, &server.base_url(), &[9999])],
984+
));
985+
});
986+
987+
// Page 1: observation 300 with photo_id 9999
988+
let _observations_page1_mock = server.mock(|when, then| {
989+
when.method(GET).path("/observations");
990+
then.status(200)
991+
.header("content-type", "application/json")
992+
.json_body(observations_response_json(
993+
2,
994+
vec![observation_json(300, &server.base_url(), &[9999])],
995+
));
996+
});
997+
998+
let _taxa_mock = server.mock(|when, then| {
999+
when.method(GET).path_contains("/taxa");
1000+
then.status(200)
1001+
.header("content-type", "application/json")
1002+
.json_body(taxa_response_json());
1003+
});
1004+
1005+
// Photo 9999 will be requested once per batch — twice total
1006+
let _photo_mock = server.mock(|when, then| {
1007+
when.method(GET).path("/photo9999.jpg");
1008+
then.status(200)
1009+
.header("content-type", "image/jpeg")
1010+
.body(MINIMAL_PNG);
1011+
});
1012+
1013+
let temp_dir = TempDir::new().unwrap();
1014+
let output_path = temp_dir.path().join("test.zip");
1015+
1016+
let params = observations_api::ObservationsGetParams {
1017+
taxon_id: Some(vec!["47126".to_string()]),
1018+
per_page: Some("1".to_string()),
1019+
..chuck_core::api::params::DEFAULT_GET_PARAMS.clone()
1020+
};
1021+
1022+
let downloader = Downloader::with_config(
1023+
params,
1024+
vec![DwcaExtension::SimpleMultimedia],
1025+
true,
1026+
config,
1027+
);
1028+
1029+
let result = downloader.execute(output_path.to_str().unwrap(), |_| {}, None).await;
1030+
assert!(result.is_ok(), "Download should succeed: {:?}", result.err());
1031+
1032+
// Opening the archive must not fail with "Duplicate filename"
1033+
let zip_data = std::fs::read(&output_path).unwrap();
1034+
let mut zip = zip::ZipArchive::new(std::io::Cursor::new(zip_data))
1035+
.expect("ZipArchive::new should succeed — no duplicate entries");
1036+
1037+
let photo_entries: Vec<_> = (0..zip.len())
1038+
.map(|i| zip.by_index(i).unwrap().name().to_string())
1039+
.filter(|n| n.contains("9999"))
1040+
.collect();
1041+
1042+
assert_eq!(
1043+
photo_entries.len(),
1044+
1,
1045+
"expected exactly one ZIP entry for photo 9999, got: {photo_entries:?}"
1046+
);
1047+
}

0 commit comments

Comments
 (0)