@@ -84,7 +84,6 @@ def downloadDataset(
8484 from ndi .dataset import ndi_dataset_dir
8585
8686 documents = jsons2documents (doc_jsons )
87- conversion_lost = len (doc_jsons ) - len (documents )
8887 dataset = ndi_dataset_dir ("" , target , documents = documents )
8988
9089 # Create remote link document if not already present
@@ -113,85 +112,76 @@ def downloadDataset(
113112 if verbose :
114113 print (f' Files downloaded: { report ["downloaded" ]} , failed: { report ["failed" ]} ' )
115114
116- # Collect failures: conversion + exception-tracked + silent (DID-python)
117- add_failures : list [tuple [str , str ]] = list (getattr (dataset , "add_doc_failures" , []))
118-
119- # Cross-check using raw DID-python doc IDs (not isa('base') query,
120- # which might miss documents whose type info wasn't stored correctly).
115+ # Verify every downloaded document made it into the local database.
116+ # The local dataset may have *more* documents (e.g. session and
117+ # session-in-a-dataset docs created internally), so we only check
118+ # that every remote doc ID is present locally.
121119 db_ids = set (
122120 dataset ._session ._database ._driver ._db .get_doc_ids (
123121 dataset ._session ._database ._driver ._branch_id
124122 )
125123 )
126124
127- # Build a map from doc_id -> original JSON for missing-doc output
128- doc_json_by_id : dict [ str , dict ] = {}
125+ missing : list [ str ] = []
126+ missing_jsons : list [ dict ] = []
129127 for dj in doc_jsons :
130128 did = dj .get ("base" , {}).get ("id" , "" ) if isinstance (dj , dict ) else ""
131- if did :
132- doc_json_by_id [did ] = dj
133-
134- # Find documents that were "added" (no exception) but aren't in the DB
135- tracked_ids = {f [0 ] for f in add_failures }
136- silent_failures : list [str ] = []
137- for doc in documents :
138- doc_id = (
139- doc .document_properties .get ("base" , {}).get ("id" , "" )
140- if hasattr (doc , "document_properties" )
141- else doc .get ("base" , {}).get ("id" , "" )
142- )
143- if doc_id and doc_id not in db_ids and doc_id not in tracked_ids :
144- silent_failures .append (doc_id )
145-
146- total_lost = conversion_lost + len (add_failures ) + len (silent_failures )
129+ if did and did not in db_ids :
130+ missing .append (did )
131+ missing_jsons .append (dj )
147132
148133 if verbose :
149134 print ("Download complete." )
150135
151- if total_lost > 0 :
152- # Write missing documents to a JSON file for inspection
153- missing_docs_path = target / "missingDocuments.json"
154- missing_docs = []
155- for doc_id in silent_failures :
156- if doc_id in doc_json_by_id :
157- missing_docs .append (doc_json_by_id [doc_id ])
136+ if missing :
137+ # Print the document_class of each missing doc for diagnostics.
138+ # Session/dataset docs from older datasets are expected to be
139+ # missing (superseded by docs created locally during dataset init).
140+ session_dataset_types = {
141+ "ndi_session" ,
142+ "ndi_dataset" ,
143+ "session" ,
144+ "dataset" ,
145+ "session_in_a_dataset" ,
146+ "dataset_session_info" ,
147+ }
148+ real_missing : list [tuple [str , str ]] = []
149+ for doc_id , dj in zip (missing , missing_jsons ):
150+ doc_class = (
151+ dj .get ("document_class" , {}).get ("class_name" , "" ) if isinstance (dj , dict ) else ""
152+ )
153+ superclasses = (
154+ dj .get ("document_class" , {}).get ("superclasses" , []) if isinstance (dj , dict ) else []
155+ )
156+ all_types = {doc_class } | {
157+ sc .get ("class_name" , "" ) if isinstance (sc , dict ) else str (sc )
158+ for sc in (superclasses if isinstance (superclasses , list ) else [])
159+ }
160+ if all_types & session_dataset_types :
161+ print (
162+ f" Note: remote doc { doc_id } (class: { doc_class } ) "
163+ f"not in local DB — expected for session/dataset docs"
164+ )
158165 else :
159- missing_docs .append ({"base" : {"id" : doc_id }})
160- for doc_id , reason in add_failures :
161- entry = dict (doc_json_by_id .get (doc_id , {"base" : {"id" : doc_id }}))
162- entry ["_add_error" ] = reason
163- missing_docs .append (entry )
164- if missing_docs :
165- import json
166+ print (f" WARNING: remote doc { doc_id } (class: { doc_class } ) missing from local DB" )
167+ real_missing .append ((doc_id , doc_class ))
166168
167- missing_docs_path .write_text (json .dumps (missing_docs , indent = 2 , default = str ))
169+ if real_missing :
170+ missing_docs_path = target / "missingDocuments.json"
171+ import json
168172
169- lines = [
170- f"Downloaded { len (doc_jsons )} documents but only "
171- f"{ len (db_ids )} were added to the dataset. "
172- f"{ total_lost } document(s) lost:"
173- ]
174- if conversion_lost > 0 :
175- lines .append (f"\n { conversion_lost } failed to convert from JSON" " to ndi_document" )
176- if add_failures :
177- lines .append (f"\n { len (add_failures )} raised errors during" " database add:" )
178- for doc_id , reason in add_failures [:50 ]:
179- lines .append (f"\n - { doc_id } : { reason } " )
180- if len (add_failures ) > 50 :
181- lines .append (f"\n ... and { len (add_failures ) - 50 } more" )
182- if silent_failures :
183- lines .append (
184- f"\n { len (silent_failures )} were passed to"
185- " database.add() without error but are NOT in the"
186- " database (possible DID-python bug):"
187- )
188- for doc_id in silent_failures [:50 ]:
189- lines .append (f"\n - { doc_id } " )
190- if len (silent_failures ) > 50 :
191- lines .append (f"\n ... and { len (silent_failures ) - 50 } more" )
192- if missing_docs :
193- lines .append (f"\n Full JSON of missing documents written to:" f"\n { missing_docs_path } " )
194- raise RuntimeError ("" .join (lines ))
173+ missing_docs_path .write_text (json .dumps (missing_jsons , indent = 2 , default = str ))
174+
175+ lines = [
176+ f"Downloaded { len (doc_jsons )} documents but "
177+ f"{ len (real_missing )} are missing from the local dataset:"
178+ ]
179+ for doc_id , doc_class in real_missing [:50 ]:
180+ lines .append (f"\n - { doc_id } (class: { doc_class } )" )
181+ if len (real_missing ) > 50 :
182+ lines .append (f"\n ... and { len (real_missing ) - 50 } more" )
183+ lines .append (f"\n Full JSON of missing documents written to:\n { missing_docs_path } " )
184+ raise RuntimeError ("" .join (lines ))
195185
196186 return dataset
197187
0 commit comments