11use std:: fs:: File ;
22use std:: io:: Write ;
3- use std:: path:: { PathBuf } ;
3+ use std:: path:: { Path , PathBuf } ;
44use tempfile:: TempDir ;
55use zip:: write:: { FileOptions , ZipWriter } ;
66use zip:: CompressionMethod ;
@@ -16,6 +16,9 @@ use crate::darwin_core::{
1616/// A DarwinCore Archive builder that can stream occurrence records and generate a compliant ZIP archive
1717pub struct ArchiveBuilder {
1818 temp_dir : TempDir ,
19+ zip : ZipWriter < File > ,
20+ /// The final destination path; the ZIP is written to a temp file and renamed here on success.
21+ output_path : PathBuf ,
1922 occurrence_writer : csv:: Writer < File > ,
2023 multimedia_writer : Option < csv:: Writer < File > > ,
2124 audiovisual_writer : Option < csv:: Writer < File > > ,
@@ -32,30 +35,37 @@ pub struct ArchiveBuilder {
3235 audiovisual_file_path : PathBuf ,
3336 identification_file_path : PathBuf ,
3437 comment_file_path : PathBuf ,
35- media_dir_path : PathBuf ,
3638 metadata : Metadata ,
3739}
3840
3941impl ArchiveBuilder {
4042 /// Create a new DarwinCore Archive builder.
41- /// `base_dir` is the directory in which the temporary working directory is created;
42- /// pass the parent of the output ZIP so temp files land on the same filesystem .
43+ /// Opens the output ZIP file immediately; pass the final output path so that
44+ /// temp files land on the same filesystem (important on Linux where /tmp may be tmpfs) .
4345 pub fn new (
4446 dwc_extensions : Vec < crate :: DwcaExtension > ,
4547 metadata : Metadata ,
46- base_dir : & std :: path :: Path ,
48+ output_path : & Path ,
4749 ) -> Result < Self , Box < dyn std:: error:: Error > > {
50+ let base_dir = output_path. parent ( ) . unwrap_or ( Path :: new ( "." ) ) ;
4851 let temp_dir = TempDir :: new_in ( base_dir) ?;
4952 let occurrence_file_path = temp_dir. path ( ) . join ( "occurrence.csv" ) ;
5053 let multimedia_file_path = temp_dir. path ( ) . join ( "multimedia.csv" ) ;
5154 let audiovisual_file_path = temp_dir. path ( ) . join ( "audiovisual.csv" ) ;
5255 let identification_file_path = temp_dir. path ( ) . join ( "identification.csv" ) ;
5356 let comment_file_path = temp_dir. path ( ) . join ( "comment.csv" ) ;
54- let media_dir_path = temp_dir. path ( ) . join ( "media" ) ;
5557
56- // Create media directory
58+ // Create media staging directory inside temp dir
59+ let media_dir_path = temp_dir. path ( ) . join ( "media" ) ;
5760 std:: fs:: create_dir_all ( & media_dir_path) ?;
5861
62+ // Write the ZIP to a temp file inside the temp dir so that a cancelled or failed
63+ // download leaves no partial file at the final output path. On successful build()
64+ // the temp ZIP is renamed to output_path (same filesystem → atomic rename).
65+ let zip_temp_path = temp_dir. path ( ) . join ( "archive.zip" ) ;
66+ let zip_file = File :: create ( & zip_temp_path) ?;
67+ let zip = ZipWriter :: new ( zip_file) ;
68+
5969 // Create CSV writer for occurrence records
6070 let occurrence_file = File :: create ( & occurrence_file_path) ?;
6171 let mut occurrence_writer = csv:: WriterBuilder :: new ( )
@@ -68,6 +78,8 @@ impl ArchiveBuilder {
6878
6979 Ok ( Self {
7080 temp_dir,
81+ zip,
82+ output_path : output_path. to_path_buf ( ) ,
7183 occurrence_writer,
7284 multimedia_writer : None ,
7385 audiovisual_writer : None ,
@@ -84,14 +96,34 @@ impl ArchiveBuilder {
8496 audiovisual_file_path,
8597 identification_file_path,
8698 comment_file_path,
87- media_dir_path,
8899 metadata,
89100 } )
90101 }
91102
92- /// Get the media directory path for downloading photos
93- pub fn media_dir ( & self ) -> & std:: path:: Path {
94- & self . media_dir_path
103+ /// Get the media staging directory path for downloading files before adding to the ZIP.
104+ pub fn media_dir ( & self ) -> PathBuf {
105+ self . temp_dir . path ( ) . join ( "media" )
106+ }
107+
108+ /// Stream a staged media file into the open ZIP and remove it from the staging directory.
109+ /// `rel_zip_path` is the path as it should appear in the ZIP (e.g. `"media/2024/01/15/12345.jpg"`).
110+ /// The file must exist at `temp_dir / rel_zip_path`.
111+ pub fn add_media_from_temp (
112+ & mut self ,
113+ rel_zip_path : & str ,
114+ ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
115+ let local_path = self . temp_dir . path ( ) . join ( rel_zip_path) ;
116+ if !local_path. exists ( ) {
117+ return Ok ( ( ) ) ;
118+ }
119+ let zip_opts: FileOptions < ( ) > = FileOptions :: default ( )
120+ . compression_method ( CompressionMethod :: Stored )
121+ . unix_permissions ( 0o644 ) ;
122+ self . zip . start_file ( rel_zip_path, zip_opts) ?;
123+ let mut file = File :: open ( & local_path) ?;
124+ std:: io:: copy ( & mut file, & mut self . zip ) ?;
125+ std:: fs:: remove_file ( & local_path) ?;
126+ Ok ( ( ) )
95127 }
96128
97129 /// Add a batch of DarwinCore occurrences to the archive
@@ -237,8 +269,9 @@ impl ArchiveBuilder {
237269 Ok ( ( ) )
238270 }
239271
240- /// Build the final archive and create the ZIP file
241- pub async fn build ( mut self , output_path : & str ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
272+ /// Finish writing the archive. All media must have been added via `add_media_from_temp`
273+ /// before calling this; only CSV files and metadata are written here.
274+ pub async fn build ( mut self ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
242275 // Ensure all CSV data is written
243276 self . occurrence_writer . flush ( ) ?;
244277 drop ( self . occurrence_writer ) ; // Close the file
@@ -277,34 +310,31 @@ impl ArchiveBuilder {
277310 let eml_file_path = self . temp_dir . path ( ) . join ( "eml.xml" ) ;
278311 std:: fs:: write ( & eml_file_path, eml_xml) ?;
279312
280- // Create ZIP archive
281- let zip_file = File :: create ( output_path) ?;
282- let mut zip = ZipWriter :: new ( zip_file) ;
283313 let options: FileOptions < ( ) > = FileOptions :: default ( )
284314 . compression_method ( CompressionMethod :: Deflated )
285315 . unix_permissions ( 0o644 ) ;
286316
287317 // Add meta.xml to ZIP
288- zip. start_file ( "meta.xml" , options) ?;
318+ self . zip . start_file ( "meta.xml" , options) ?;
289319 let meta_content = std:: fs:: read ( & meta_file_path) ?;
290- zip. write_all ( & meta_content) ?;
320+ self . zip . write_all ( & meta_content) ?;
291321
292322 // Add eml.xml to ZIP
293- zip. start_file ( "eml.xml" , options) ?;
323+ self . zip . start_file ( "eml.xml" , options) ?;
294324 let eml_content = std:: fs:: read ( & eml_file_path) ?;
295- zip. write_all ( & eml_content) ?;
325+ self . zip . write_all ( & eml_content) ?;
296326
297327 // Add chuck.json if inat_query is set
298328 if let Some ( ref inat_query) = self . metadata . inat_query {
299329 let chuck_json = serde_json:: json!( { "inat_query" : inat_query } ) . to_string ( ) ;
300- zip. start_file ( "chuck.json" , options) ?;
301- zip. write_all ( chuck_json. as_bytes ( ) ) ?;
330+ self . zip . start_file ( "chuck.json" , options) ?;
331+ self . zip . write_all ( chuck_json. as_bytes ( ) ) ?;
302332 }
303333
304334 // Add occurrence.csv to ZIP
305- zip. start_file ( "occurrence.csv" , options) ?;
335+ self . zip . start_file ( "occurrence.csv" , options) ?;
306336 let occurrence_content = std:: fs:: read ( & self . occurrence_file_path ) ?;
307- zip. write_all ( & occurrence_content) ?;
337+ self . zip . write_all ( & occurrence_content) ?;
308338
309339 // Add extension CSVs to ZIP for all enabled extensions, even if empty
310340 let ext_specs: & [ ( crate :: DwcaExtension , & str , & std:: path:: Path , Vec < & str > ) ] = & [
@@ -346,75 +376,24 @@ impl ArchiveBuilder {
346376 wtr. write_record ( headers) ?;
347377 wtr. flush ( ) ?;
348378 }
349- zip. start_file ( * zip_name, options) ?;
350- zip. write_all ( & std:: fs:: read ( file_path) ?) ?;
351- }
352-
353- // Add media directory contents to ZIP if it exists and has files
354- println ! ( "Zipping media..." ) ;
355- Self :: add_directory_to_zip ( & mut zip, & self . media_dir_path , "media" ) ?;
356-
357- // Finish ZIP
358- zip. finish ( ) ?;
359-
360- println ! ( "DarwinCore Archive created: {output_path}" ) ;
361- println ! ( "Records exported: {}" , self . record_count) ;
362- if self . multimedia_count > 0 {
363- println ! ( "Multimedia records exported: {}" , self . multimedia_count) ;
379+ self . zip . start_file ( * zip_name, options) ?;
380+ self . zip . write_all ( & std:: fs:: read ( file_path) ?) ?;
364381 }
365- if self . audiovisual_count > 0 {
366- println ! ( "Audiovisual records exported: {}" , self . audiovisual_count) ;
367- }
368- if self . identification_count > 0 {
369- println ! ( "Identification records exported: {}" , self . identification_count) ;
370- }
371- if self . comment_count > 0 {
372- println ! ( "Comment records exported: {}" , self . comment_count) ;
373- }
374-
375- Ok ( ( ) )
376- }
377382
378- /// Add directory contents to ZIP archive using streaming to avoid loading entire files into memory
379- fn add_directory_to_zip (
380- zip : & mut ZipWriter < File > ,
381- dir_path : & std:: path:: Path ,
382- zip_prefix : & str ,
383- ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
384- if !dir_path. exists ( ) || !dir_path. is_dir ( ) {
385- return Ok ( ( ) ) ;
386- }
383+ // Finish ZIP (writes central directory)
384+ let zip_temp_path = self . temp_dir . path ( ) . join ( "archive.zip" ) ;
385+ self . zip . finish ( ) ?;
387386
388- // There's no point in trying to compress JPGs
389- let zip_opts: FileOptions < ( ) > = FileOptions :: default ( )
390- . compression_method ( CompressionMethod :: Stored )
391- . unix_permissions ( 0o644 ) ;
387+ // Rename the temp ZIP to the final output path. Both are on the same filesystem
388+ // so this is an atomic rename on most systems.
389+ std:: fs:: rename ( & zip_temp_path, & self . output_path ) ?;
392390
393- for entry in std:: fs:: read_dir ( dir_path) ? {
394- let entry = entry?;
395- let path = entry. path ( ) ;
396-
397- if path. is_file ( ) {
398- let file_name = path. file_name ( )
399- . and_then ( |name| name. to_str ( ) )
400- . ok_or ( "Invalid filename" ) ?;
401-
402- let zip_path = format ! ( "{zip_prefix}/{file_name}" ) ;
403- zip. start_file ( zip_path, zip_opts) ?;
404-
405- // Stream the file contents instead of reading entirely into memory
406- let mut file = File :: open ( & path) ?;
407- std:: io:: copy ( & mut file, zip) ?;
408- } else if path. is_dir ( ) {
409- // Recursively add subdirectory contents
410- let dir_name = path. file_name ( )
411- . and_then ( |name| name. to_str ( ) )
412- . ok_or ( "Invalid directory name" ) ?;
413-
414- let subdir_zip_prefix = format ! ( "{zip_prefix}/{dir_name}" ) ;
415- Self :: add_directory_to_zip ( zip, & path, & subdir_zip_prefix) ?;
416- }
417- }
391+ log:: info!(
392+ "DarwinCore Archive complete: {} records, {} multimedia, {} audiovisual, \
393+ {} identifications, {} comments",
394+ self . record_count, self . multimedia_count, self . audiovisual_count,
395+ self . identification_count, self . comment_count,
396+ ) ;
418397
419398 Ok ( ( ) )
420399 }
@@ -428,28 +407,53 @@ mod tests {
428407 use zip:: ZipArchive ;
429408
430409 #[ test]
431- fn test_archive_builder_temp_dir_in_base_dir ( ) {
432- let base = tempfile:: TempDir :: new ( ) . unwrap ( ) ;
433- let metadata = Metadata :: default ( ) ;
434- let builder = ArchiveBuilder :: new ( vec ! [ ] , metadata , base . path ( ) ) . unwrap ( ) ;
435- // media_dir lives inside temp_dir, so it must be under base
410+ fn test_archive_temp_files_in_same_dir_as_output ( ) {
411+ let output_dir = tempfile:: TempDir :: new ( ) . unwrap ( ) ;
412+ let output_path = output_dir . path ( ) . join ( "test.zip" ) ;
413+ let builder = ArchiveBuilder :: new ( vec ! [ ] , Metadata :: default ( ) , & output_path ) . unwrap ( ) ;
414+ // media_dir lives inside temp_dir, which must be under the output's parent dir
436415 assert ! (
437- builder. media_dir( ) . starts_with( base . path( ) ) ,
416+ builder. media_dir( ) . starts_with( output_dir . path( ) ) ,
438417 "expected media_dir {:?} to be under {:?}" ,
439418 builder. media_dir( ) ,
440- base . path( )
419+ output_dir . path( )
441420 ) ;
442421 }
443422
423+ #[ tokio:: test]
424+ async fn test_add_media_from_temp_adds_to_zip_and_removes_local_file ( ) {
425+ let tmp = tempfile:: NamedTempFile :: new ( ) . unwrap ( ) ;
426+ let mut builder = ArchiveBuilder :: new ( vec ! [ ] , Metadata :: default ( ) , tmp. path ( ) ) . unwrap ( ) ;
427+
428+ // Create a fake media file in the staging dir
429+ let media_dir = builder. media_dir ( ) ;
430+ std:: fs:: create_dir_all ( media_dir. join ( "2024/01/15" ) ) . unwrap ( ) ;
431+ let staged = media_dir. join ( "2024/01/15/99999.jpg" ) ;
432+ std:: fs:: write ( & staged, b"fake image data" ) . unwrap ( ) ;
433+
434+ builder. add_media_from_temp ( "media/2024/01/15/99999.jpg" ) . unwrap ( ) ;
435+
436+ // Staging file must be gone
437+ assert ! ( !staged. exists( ) , "staged file should have been removed" ) ;
438+
439+ builder. build ( ) . await . unwrap ( ) ;
440+
441+ let file = std:: fs:: File :: open ( tmp. path ( ) ) . unwrap ( ) ;
442+ let mut archive = ZipArchive :: new ( file) . unwrap ( ) ;
443+ let mut entry = archive. by_name ( "media/2024/01/15/99999.jpg" )
444+ . expect ( "media entry missing from zip" ) ;
445+ let mut contents = vec ! [ ] ;
446+ std:: io:: Read :: read_to_end ( & mut entry, & mut contents) . unwrap ( ) ;
447+ assert_eq ! ( contents, b"fake image data" ) ;
448+ }
449+
444450 /// Build a minimal archive with no occurrences and the given extensions enabled,
445451 /// return the list of file names present in the ZIP.
446452 async fn zip_file_names ( extensions : Vec < DwcaExtension > ) -> Vec < String > {
447- let metadata = Metadata :: default ( ) ;
448- let builder = ArchiveBuilder :: new ( extensions, metadata, & std:: env:: temp_dir ( ) ) . unwrap ( ) ;
449453 let tmp = tempfile:: NamedTempFile :: new ( ) . unwrap ( ) ;
450- let path = tmp. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ;
451- builder. build ( & path ) . await . unwrap ( ) ;
452- let file = std:: fs:: File :: open ( & path) . unwrap ( ) ;
454+ let builder = ArchiveBuilder :: new ( extensions , Metadata :: default ( ) , tmp. path ( ) ) . unwrap ( ) ;
455+ builder. build ( ) . await . unwrap ( ) ;
456+ let file = std:: fs:: File :: open ( tmp . path ( ) ) . unwrap ( ) ;
453457 let mut archive = ZipArchive :: new ( file) . unwrap ( ) ;
454458 ( 0 ..archive. len ( ) )
455459 . map ( |i| archive. by_index ( i) . unwrap ( ) . name ( ) . to_string ( ) )
@@ -458,15 +462,14 @@ mod tests {
458462
459463 #[ tokio:: test]
460464 async fn test_chuck_json_written_when_inat_query_present ( ) {
465+ let tmp = tempfile:: NamedTempFile :: new ( ) . unwrap ( ) ;
461466 let metadata = Metadata {
462467 inat_query : Some ( "taxon_id=47790" . to_string ( ) ) ,
463468 ..Default :: default ( )
464469 } ;
465- let builder = ArchiveBuilder :: new ( vec ! [ ] , metadata, & std:: env:: temp_dir ( ) ) . unwrap ( ) ;
466- let tmp = tempfile:: NamedTempFile :: new ( ) . unwrap ( ) ;
467- let path = tmp. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ;
468- builder. build ( & path) . await . unwrap ( ) ;
469- let file = std:: fs:: File :: open ( & path) . unwrap ( ) ;
470+ let builder = ArchiveBuilder :: new ( vec ! [ ] , metadata, tmp. path ( ) ) . unwrap ( ) ;
471+ builder. build ( ) . await . unwrap ( ) ;
472+ let file = std:: fs:: File :: open ( tmp. path ( ) ) . unwrap ( ) ;
470473 let mut archive = ZipArchive :: new ( file) . unwrap ( ) ;
471474 let mut chuck_file = archive. by_name ( "chuck.json" ) . expect ( "chuck.json missing" ) ;
472475 let mut contents = String :: new ( ) ;
0 commit comments