Skip to content

Commit 60c1cd2

Browse files
kuedaclaude
andauthored
refactor: add media to zip during download process (#47)
And not in the finalizing step. Hopefully this means "finalizing" is close to instantaneous. Co-authored-by: Claude (claude-sonnet-4-6) <noreply@anthropic.com>
1 parent 5d1a534 commit 60c1cd2

4 files changed

Lines changed: 129 additions & 124 deletions

File tree

chuck-core/src/archive_updater.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -821,15 +821,15 @@ mod tests {
821821
use crate::darwin_core::meta::Metadata;
822822
use crate::darwin_core::archive::ArchiveBuilder;
823823

824+
let tmp = tempfile::NamedTempFile::new().unwrap();
824825
let metadata = Metadata::default();
825826
let builder = ArchiveBuilder::new(
826827
vec![DwcaExtension::SimpleMultimedia, DwcaExtension::Identifications],
827828
metadata,
828-
&std::env::temp_dir(),
829+
tmp.path(),
829830
).unwrap();
830-
let tmp = tempfile::NamedTempFile::new().unwrap();
831831
let path = tmp.path().to_str().unwrap().to_string();
832-
builder.build(&path).await.unwrap();
832+
builder.build().await.unwrap();
833833

834834
let exts = infer_extensions(&path).unwrap();
835835
assert!(exts.contains(&DwcaExtension::SimpleMultimedia));
@@ -843,11 +843,11 @@ mod tests {
843843
use crate::darwin_core::meta::Metadata;
844844
use crate::darwin_core::archive::ArchiveBuilder;
845845

846-
let metadata = Metadata::default();
847-
let builder = ArchiveBuilder::new(vec![], metadata, &std::env::temp_dir()).unwrap();
848846
let tmp = tempfile::NamedTempFile::new().unwrap();
847+
let metadata = Metadata::default();
848+
let builder = ArchiveBuilder::new(vec![], metadata, tmp.path()).unwrap();
849849
let path = tmp.path().to_str().unwrap().to_string();
850-
builder.build(&path).await.unwrap();
850+
builder.build().await.unwrap();
851851

852852
assert!(!archive_has_media(&path).unwrap());
853853
}

chuck-core/src/chuck_metadata.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,10 @@ mod tests {
5555
use crate::darwin_core::archive::ArchiveBuilder;
5656

5757
async fn build_archive(inat_query: Option<String>) -> tempfile::NamedTempFile {
58-
let metadata = Metadata { inat_query, ..Default::default() };
59-
let builder = ArchiveBuilder::new(vec![], metadata, &std::env::temp_dir()).unwrap();
6058
let tmp = tempfile::NamedTempFile::new().unwrap();
61-
builder.build(tmp.path().to_str().unwrap()).await.unwrap();
59+
let metadata = Metadata { inat_query, ..Default::default() };
60+
let builder = ArchiveBuilder::new(vec![], metadata, tmp.path()).unwrap();
61+
builder.build().await.unwrap();
6262
tmp
6363
}
6464

chuck-core/src/darwin_core/archive.rs

Lines changed: 109 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::fs::File;
22
use std::io::Write;
3-
use std::path::{PathBuf};
3+
use std::path::{Path, PathBuf};
44
use tempfile::TempDir;
55
use zip::write::{FileOptions, ZipWriter};
66
use zip::CompressionMethod;
@@ -16,6 +16,9 @@ use crate::darwin_core::{
1616
/// A DarwinCore Archive builder that can stream occurrence records and generate a compliant ZIP archive
1717
pub struct ArchiveBuilder {
1818
temp_dir: TempDir,
19+
zip: ZipWriter<File>,
20+
/// The final destination path; the ZIP is written to a temp file and renamed here on success.
21+
output_path: PathBuf,
1922
occurrence_writer: csv::Writer<File>,
2023
multimedia_writer: Option<csv::Writer<File>>,
2124
audiovisual_writer: Option<csv::Writer<File>>,
@@ -32,30 +35,37 @@ pub struct ArchiveBuilder {
3235
audiovisual_file_path: PathBuf,
3336
identification_file_path: PathBuf,
3437
comment_file_path: PathBuf,
35-
media_dir_path: PathBuf,
3638
metadata: Metadata,
3739
}
3840

3941
impl ArchiveBuilder {
4042
/// Create a new DarwinCore Archive builder.
41-
/// `base_dir` is the directory in which the temporary working directory is created;
42-
/// pass the parent of the output ZIP so temp files land on the same filesystem.
43+
/// Opens the output ZIP file immediately; pass the final output path so that
44+
/// temp files land on the same filesystem (important on Linux where /tmp may be tmpfs).
4345
pub fn new(
4446
dwc_extensions: Vec<crate::DwcaExtension>,
4547
metadata: Metadata,
46-
base_dir: &std::path::Path,
48+
output_path: &Path,
4749
) -> Result<Self, Box<dyn std::error::Error>> {
50+
let base_dir = output_path.parent().unwrap_or(Path::new("."));
4851
let temp_dir = TempDir::new_in(base_dir)?;
4952
let occurrence_file_path = temp_dir.path().join("occurrence.csv");
5053
let multimedia_file_path = temp_dir.path().join("multimedia.csv");
5154
let audiovisual_file_path = temp_dir.path().join("audiovisual.csv");
5255
let identification_file_path = temp_dir.path().join("identification.csv");
5356
let comment_file_path = temp_dir.path().join("comment.csv");
54-
let media_dir_path = temp_dir.path().join("media");
5557

56-
// Create media directory
58+
// Create media staging directory inside temp dir
59+
let media_dir_path = temp_dir.path().join("media");
5760
std::fs::create_dir_all(&media_dir_path)?;
5861

62+
// Write the ZIP to a temp file inside the temp dir so that a cancelled or failed
63+
// download leaves no partial file at the final output path. On successful build()
64+
// the temp ZIP is renamed to output_path (same filesystem → atomic rename).
65+
let zip_temp_path = temp_dir.path().join("archive.zip");
66+
let zip_file = File::create(&zip_temp_path)?;
67+
let zip = ZipWriter::new(zip_file);
68+
5969
// Create CSV writer for occurrence records
6070
let occurrence_file = File::create(&occurrence_file_path)?;
6171
let mut occurrence_writer = csv::WriterBuilder::new()
@@ -68,6 +78,8 @@ impl ArchiveBuilder {
6878

6979
Ok(Self {
7080
temp_dir,
81+
zip,
82+
output_path: output_path.to_path_buf(),
7183
occurrence_writer,
7284
multimedia_writer: None,
7385
audiovisual_writer: None,
@@ -84,14 +96,34 @@ impl ArchiveBuilder {
8496
audiovisual_file_path,
8597
identification_file_path,
8698
comment_file_path,
87-
media_dir_path,
8899
metadata,
89100
})
90101
}
91102

92-
/// Get the media directory path for downloading photos
93-
pub fn media_dir(&self) -> &std::path::Path {
94-
&self.media_dir_path
103+
/// Get the media staging directory path for downloading files before adding to the ZIP.
104+
pub fn media_dir(&self) -> PathBuf {
105+
self.temp_dir.path().join("media")
106+
}
107+
108+
/// Stream a staged media file into the open ZIP and remove it from the staging directory.
109+
/// `rel_zip_path` is the path as it should appear in the ZIP (e.g. `"media/2024/01/15/12345.jpg"`).
110+
/// The file must exist at `temp_dir / rel_zip_path`.
111+
pub fn add_media_from_temp(
112+
&mut self,
113+
rel_zip_path: &str,
114+
) -> Result<(), Box<dyn std::error::Error>> {
115+
let local_path = self.temp_dir.path().join(rel_zip_path);
116+
if !local_path.exists() {
117+
return Ok(());
118+
}
119+
let zip_opts: FileOptions<()> = FileOptions::default()
120+
.compression_method(CompressionMethod::Stored)
121+
.unix_permissions(0o644);
122+
self.zip.start_file(rel_zip_path, zip_opts)?;
123+
let mut file = File::open(&local_path)?;
124+
std::io::copy(&mut file, &mut self.zip)?;
125+
std::fs::remove_file(&local_path)?;
126+
Ok(())
95127
}
96128

97129
/// Add a batch of DarwinCore occurrences to the archive
@@ -237,8 +269,9 @@ impl ArchiveBuilder {
237269
Ok(())
238270
}
239271

240-
/// Build the final archive and create the ZIP file
241-
pub async fn build(mut self, output_path: &str) -> Result<(), Box<dyn std::error::Error>> {
272+
/// Finish writing the archive. All media must have been added via `add_media_from_temp`
273+
/// before calling this; only CSV files and metadata are written here.
274+
pub async fn build(mut self) -> Result<(), Box<dyn std::error::Error>> {
242275
// Ensure all CSV data is written
243276
self.occurrence_writer.flush()?;
244277
drop(self.occurrence_writer); // Close the file
@@ -277,34 +310,31 @@ impl ArchiveBuilder {
277310
let eml_file_path = self.temp_dir.path().join("eml.xml");
278311
std::fs::write(&eml_file_path, eml_xml)?;
279312

280-
// Create ZIP archive
281-
let zip_file = File::create(output_path)?;
282-
let mut zip = ZipWriter::new(zip_file);
283313
let options: FileOptions<()> = FileOptions::default()
284314
.compression_method(CompressionMethod::Deflated)
285315
.unix_permissions(0o644);
286316

287317
// Add meta.xml to ZIP
288-
zip.start_file("meta.xml", options)?;
318+
self.zip.start_file("meta.xml", options)?;
289319
let meta_content = std::fs::read(&meta_file_path)?;
290-
zip.write_all(&meta_content)?;
320+
self.zip.write_all(&meta_content)?;
291321

292322
// Add eml.xml to ZIP
293-
zip.start_file("eml.xml", options)?;
323+
self.zip.start_file("eml.xml", options)?;
294324
let eml_content = std::fs::read(&eml_file_path)?;
295-
zip.write_all(&eml_content)?;
325+
self.zip.write_all(&eml_content)?;
296326

297327
// Add chuck.json if inat_query is set
298328
if let Some(ref inat_query) = self.metadata.inat_query {
299329
let chuck_json = serde_json::json!({ "inat_query": inat_query }).to_string();
300-
zip.start_file("chuck.json", options)?;
301-
zip.write_all(chuck_json.as_bytes())?;
330+
self.zip.start_file("chuck.json", options)?;
331+
self.zip.write_all(chuck_json.as_bytes())?;
302332
}
303333

304334
// Add occurrence.csv to ZIP
305-
zip.start_file("occurrence.csv", options)?;
335+
self.zip.start_file("occurrence.csv", options)?;
306336
let occurrence_content = std::fs::read(&self.occurrence_file_path)?;
307-
zip.write_all(&occurrence_content)?;
337+
self.zip.write_all(&occurrence_content)?;
308338

309339
// Add extension CSVs to ZIP for all enabled extensions, even if empty
310340
let ext_specs: &[(crate::DwcaExtension, &str, &std::path::Path, Vec<&str>)] = &[
@@ -346,75 +376,24 @@ impl ArchiveBuilder {
346376
wtr.write_record(headers)?;
347377
wtr.flush()?;
348378
}
349-
zip.start_file(*zip_name, options)?;
350-
zip.write_all(&std::fs::read(file_path)?)?;
351-
}
352-
353-
// Add media directory contents to ZIP if it exists and has files
354-
println!("Zipping media...");
355-
Self::add_directory_to_zip(&mut zip, &self.media_dir_path, "media")?;
356-
357-
// Finish ZIP
358-
zip.finish()?;
359-
360-
println!("DarwinCore Archive created: {output_path}");
361-
println!("Records exported: {}", self.record_count);
362-
if self.multimedia_count > 0 {
363-
println!("Multimedia records exported: {}", self.multimedia_count);
379+
self.zip.start_file(*zip_name, options)?;
380+
self.zip.write_all(&std::fs::read(file_path)?)?;
364381
}
365-
if self.audiovisual_count > 0 {
366-
println!("Audiovisual records exported: {}", self.audiovisual_count);
367-
}
368-
if self.identification_count > 0 {
369-
println!("Identification records exported: {}", self.identification_count);
370-
}
371-
if self.comment_count > 0 {
372-
println!("Comment records exported: {}", self.comment_count);
373-
}
374-
375-
Ok(())
376-
}
377382

378-
/// Add directory contents to ZIP archive using streaming to avoid loading entire files into memory
379-
fn add_directory_to_zip(
380-
zip: &mut ZipWriter<File>,
381-
dir_path: &std::path::Path,
382-
zip_prefix: &str,
383-
) -> Result<(), Box<dyn std::error::Error>> {
384-
if !dir_path.exists() || !dir_path.is_dir() {
385-
return Ok(());
386-
}
383+
// Finish ZIP (writes central directory)
384+
let zip_temp_path = self.temp_dir.path().join("archive.zip");
385+
self.zip.finish()?;
387386

388-
// There's no point in trying to compress JPGs
389-
let zip_opts: FileOptions<()> = FileOptions::default()
390-
.compression_method(CompressionMethod::Stored)
391-
.unix_permissions(0o644);
387+
// Rename the temp ZIP to the final output path. Both are on the same filesystem
388+
// so this is an atomic rename on most systems.
389+
std::fs::rename(&zip_temp_path, &self.output_path)?;
392390

393-
for entry in std::fs::read_dir(dir_path)? {
394-
let entry = entry?;
395-
let path = entry.path();
396-
397-
if path.is_file() {
398-
let file_name = path.file_name()
399-
.and_then(|name| name.to_str())
400-
.ok_or("Invalid filename")?;
401-
402-
let zip_path = format!("{zip_prefix}/{file_name}");
403-
zip.start_file(zip_path, zip_opts)?;
404-
405-
// Stream the file contents instead of reading entirely into memory
406-
let mut file = File::open(&path)?;
407-
std::io::copy(&mut file, zip)?;
408-
} else if path.is_dir() {
409-
// Recursively add subdirectory contents
410-
let dir_name = path.file_name()
411-
.and_then(|name| name.to_str())
412-
.ok_or("Invalid directory name")?;
413-
414-
let subdir_zip_prefix = format!("{zip_prefix}/{dir_name}");
415-
Self::add_directory_to_zip(zip, &path, &subdir_zip_prefix)?;
416-
}
417-
}
391+
log::info!(
392+
"DarwinCore Archive complete: {} records, {} multimedia, {} audiovisual, \
393+
{} identifications, {} comments",
394+
self.record_count, self.multimedia_count, self.audiovisual_count,
395+
self.identification_count, self.comment_count,
396+
);
418397

419398
Ok(())
420399
}
@@ -428,28 +407,53 @@ mod tests {
428407
use zip::ZipArchive;
429408

430409
#[test]
431-
fn test_archive_builder_temp_dir_in_base_dir() {
432-
let base = tempfile::TempDir::new().unwrap();
433-
let metadata = Metadata::default();
434-
let builder = ArchiveBuilder::new(vec![], metadata, base.path()).unwrap();
435-
// media_dir lives inside temp_dir, so it must be under base
410+
fn test_archive_temp_files_in_same_dir_as_output() {
411+
let output_dir = tempfile::TempDir::new().unwrap();
412+
let output_path = output_dir.path().join("test.zip");
413+
let builder = ArchiveBuilder::new(vec![], Metadata::default(), &output_path).unwrap();
414+
// media_dir lives inside temp_dir, which must be under the output's parent dir
436415
assert!(
437-
builder.media_dir().starts_with(base.path()),
416+
builder.media_dir().starts_with(output_dir.path()),
438417
"expected media_dir {:?} to be under {:?}",
439418
builder.media_dir(),
440-
base.path()
419+
output_dir.path()
441420
);
442421
}
443422

423+
#[tokio::test]
424+
async fn test_add_media_from_temp_adds_to_zip_and_removes_local_file() {
425+
let tmp = tempfile::NamedTempFile::new().unwrap();
426+
let mut builder = ArchiveBuilder::new(vec![], Metadata::default(), tmp.path()).unwrap();
427+
428+
// Create a fake media file in the staging dir
429+
let media_dir = builder.media_dir();
430+
std::fs::create_dir_all(media_dir.join("2024/01/15")).unwrap();
431+
let staged = media_dir.join("2024/01/15/99999.jpg");
432+
std::fs::write(&staged, b"fake image data").unwrap();
433+
434+
builder.add_media_from_temp("media/2024/01/15/99999.jpg").unwrap();
435+
436+
// Staging file must be gone
437+
assert!(!staged.exists(), "staged file should have been removed");
438+
439+
builder.build().await.unwrap();
440+
441+
let file = std::fs::File::open(tmp.path()).unwrap();
442+
let mut archive = ZipArchive::new(file).unwrap();
443+
let mut entry = archive.by_name("media/2024/01/15/99999.jpg")
444+
.expect("media entry missing from zip");
445+
let mut contents = vec![];
446+
std::io::Read::read_to_end(&mut entry, &mut contents).unwrap();
447+
assert_eq!(contents, b"fake image data");
448+
}
449+
444450
/// Build a minimal archive with no occurrences and the given extensions enabled,
445451
/// return the list of file names present in the ZIP.
446452
async fn zip_file_names(extensions: Vec<DwcaExtension>) -> Vec<String> {
447-
let metadata = Metadata::default();
448-
let builder = ArchiveBuilder::new(extensions, metadata, &std::env::temp_dir()).unwrap();
449453
let tmp = tempfile::NamedTempFile::new().unwrap();
450-
let path = tmp.path().to_str().unwrap().to_string();
451-
builder.build(&path).await.unwrap();
452-
let file = std::fs::File::open(&path).unwrap();
454+
let builder = ArchiveBuilder::new(extensions, Metadata::default(), tmp.path()).unwrap();
455+
builder.build().await.unwrap();
456+
let file = std::fs::File::open(tmp.path()).unwrap();
453457
let mut archive = ZipArchive::new(file).unwrap();
454458
(0..archive.len())
455459
.map(|i| archive.by_index(i).unwrap().name().to_string())
@@ -458,15 +462,14 @@ mod tests {
458462

459463
#[tokio::test]
460464
async fn test_chuck_json_written_when_inat_query_present() {
465+
let tmp = tempfile::NamedTempFile::new().unwrap();
461466
let metadata = Metadata {
462467
inat_query: Some("taxon_id=47790".to_string()),
463468
..Default::default()
464469
};
465-
let builder = ArchiveBuilder::new(vec![], metadata, &std::env::temp_dir()).unwrap();
466-
let tmp = tempfile::NamedTempFile::new().unwrap();
467-
let path = tmp.path().to_str().unwrap().to_string();
468-
builder.build(&path).await.unwrap();
469-
let file = std::fs::File::open(&path).unwrap();
470+
let builder = ArchiveBuilder::new(vec![], metadata, tmp.path()).unwrap();
471+
builder.build().await.unwrap();
472+
let file = std::fs::File::open(tmp.path()).unwrap();
470473
let mut archive = ZipArchive::new(file).unwrap();
471474
let mut chuck_file = archive.by_name("chuck.json").expect("chuck.json missing");
472475
let mut contents = String::new();

0 commit comments

Comments
 (0)