|
| 1 | +import os |
| 2 | +import requests |
| 3 | +import json |
| 4 | +import tempfile |
| 5 | +import zipfile |
| 6 | +import time |
| 7 | +from ..api.documents import get_bulk_download_url |
| 8 | +from ..api.datasets import get_dataset |
| 9 | +from ..api.files import get_file_details |
| 10 | +from ...document import Document |
| 11 | + |
def download_document_collection(dataset_id, document_ids=None, timeout=20, chunk_size=2000):
    """
    Download documents using bulk download with chunking.

    The document IDs are split into chunks of at most ``chunk_size`` entries.
    For each chunk a bulk download URL is requested, the ZIP archive behind
    that URL is polled until it can be fetched (or ``timeout`` seconds have
    elapsed), and every top-level ``.json`` file in the archive is decoded.

    Args:
        dataset_id: Dataset identifier passed to ``get_bulk_download_url``.
        document_ids: Document IDs to download. When empty or ``None``, the
            IDs under the ``'apiId'`` key of
            ``list_remote_document_ids(dataset_id)`` are used instead.
        timeout: Seconds to keep polling the download URL for each chunk.
            Also used as the connect/read timeout of each HTTP request so a
            stalled connection cannot block forever.
        chunk_size: Maximum number of document IDs per bulk request; must be
            at least 1.

    Returns:
        A list of ``Document`` objects, one per decoded JSON struct. An empty
        list when there are no document IDs to download.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        RuntimeError: If a bulk download URL cannot be obtained, or a chunk
            cannot be downloaded before ``timeout`` elapses.
    """
    # Validate up front: 0 would divide by zero below and a negative value
    # would silently produce zero chunks (and therefore zero documents).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    if not document_ids:
        # Avoid circular import by importing inside function
        from ...sync.internal.document_utils import list_remote_document_ids
        print('No document IDs provided; fetching all document IDs from the server...')
        id_map = list_remote_document_ids(dataset_id)
        document_ids = id_map['apiId']
        if not document_ids:
            return []

    num_docs = len(document_ids)
    num_chunks = (num_docs + chunk_size - 1) // chunk_size
    document_chunks = [document_ids[i:i + chunk_size] for i in range(0, num_docs, chunk_size)]

    all_document_structs = []
    print(f'Beginning download of {num_docs} documents in {num_chunks} chunk(s).')

    for c, chunk_doc_ids in enumerate(document_chunks, 1):
        print(f'  Processing chunk {c} of {num_chunks} ({len(chunk_doc_ids)} documents)...')

        success, download_url, _, _ = get_bulk_download_url(dataset_id, chunk_doc_ids)
        if not success:
            raise RuntimeError(f"Failed to get bulk download URL for chunk {c}")

        # delete=False: only the path is reserved here; the file is reopened
        # for writing by the download helper and removed in ``finally``.
        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
            temp_zip_path = temp_zip.name

        try:
            is_finished, last_error = _fetch_zip_with_retry(download_url, temp_zip_path, timeout)
            if not is_finished:
                # Chain the last observed failure so the cause is not lost.
                raise RuntimeError(f"Download failed for chunk {c} after timeout") from last_error

            all_document_structs.extend(_load_json_structs_from_zip(temp_zip_path))
        finally:
            if os.path.exists(temp_zip_path):
                os.remove(temp_zip_path)

    print(f'Download complete. Converting {len(all_document_structs)} structs to NDI documents...')
    documents = [Document(d) for d in all_document_structs]
    print('Processing complete.')
    return documents


def _fetch_zip_with_retry(download_url, dest_path, timeout):
    """
    Poll ``download_url`` until it answers HTTP 200, streaming the body to ``dest_path``.

    Attempts are repeated once per second until ``timeout`` seconds have
    elapsed. Returns ``(True, None)`` on success, otherwise
    ``(False, last_error)`` where ``last_error`` is the most recent failure
    (or ``None`` if no attempt was made).
    """
    last_error = None
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            response = requests.get(download_url, stream=True, timeout=timeout)
            try:
                if response.status_code == 200:
                    # 'wb' truncates, so a retry after a partial write starts clean.
                    with open(dest_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    return True, None
                last_error = RuntimeError(f"Unexpected HTTP status {response.status_code}")
            finally:
                # A streamed response holds its connection until closed.
                response.close()
        except requests.RequestException as err:
            last_error = err
        time.sleep(1)

    return False, last_error


def _load_json_structs_from_zip(zip_path):
    """
    Extract ``zip_path`` to a temporary directory and decode its top-level ``.json`` files.

    A file whose top-level JSON value is a list contributes each element;
    any other value is contributed as a single struct.
    """
    structs = []
    with tempfile.TemporaryDirectory() as extract_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        # The Matlab logic assumes one JSON file per chunk (unzippedFiles{1}),
        # but the archive might contain several, so every one is read.
        # Sorted so the resulting document order is deterministic.
        for filename in sorted(os.listdir(extract_dir)):
            if not filename.endswith('.json'):
                continue
            with open(os.path.join(extract_dir, filename), 'r', encoding='utf-8') as f:
                # Handling potential NaN/Null is skipped for now, assuming standard JSON
                decoded = json.load(f)
            # dropDuplicateDocsFromJsonDecode logic is skipped for now
            if isinstance(decoded, list):
                structs.extend(decoded)
            else:
                structs.append(decoded)
    return structs
| 87 | + |
def download_dataset_files(cloud_dataset_id, target_folder, file_uuids=None, verbose=True,
                           abort_on_error=True, request_timeout=60):
    """
    Downloads dataset files from a cloud dataset.

    Each selected file is saved in ``target_folder`` under its ``uid``. Files
    not marked as uploaded, files already present locally, and files whose
    details or download URL cannot be obtained are skipped with a warning.

    Args:
        cloud_dataset_id: Dataset identifier passed to ``get_dataset`` and
            ``get_file_details``.
        target_folder: Directory to store the files in; created if missing.
        file_uuids: Optional collection of file uids to restrict the download
            to. ``None`` downloads every file listed by the dataset.
        verbose: When true, print progress messages.
        abort_on_error: When true, re-raise the first download error;
            otherwise print a warning and continue with the next file.
        request_timeout: Connect/read timeout in seconds for each HTTP
            request, so a stalled connection cannot block forever.

    Raises:
        RuntimeError: If the dataset cannot be fetched, or if specific files
            were requested but the dataset lists no files.
        Exception: Any download error, when ``abort_on_error`` is true.
    """
    success, dataset_info, _, _ = get_dataset(cloud_dataset_id)
    if not success:
        raise RuntimeError(f"Failed to get dataset: {dataset_info}")

    if 'files' not in dataset_info:
        if file_uuids is not None:
            raise RuntimeError('No files found in the dataset despite files requested.')
        return

    files = _filter_files_to_download(dataset_info['files'], file_uuids)
    num_files = len(files)

    if verbose:
        print(f'Will download {num_files} files...')

    for i, file_info in enumerate(files, 1):
        if verbose:
            _display_progress(i, num_files)

        file_uid = file_info['uid']
        exists_on_cloud = file_info.get('uploaded', False)

        if not exists_on_cloud:
            print(f'Warning: File with uuid "{file_uid}" does not exist on the cloud, skipping...')
            continue

        target_filepath = os.path.join(target_folder, file_uid)
        if os.path.exists(target_filepath):
            if verbose:
                print(f'File {i} already exists locally, skipping...')
            continue

        success, answer, _, _ = get_file_details(cloud_dataset_id, file_uid)
        if not success:
            print(f"Warning: Failed to get file details: {answer}")
            continue

        download_url = answer.get('downloadUrl')
        if not download_url:
            print(f"Warning: No download URL for file {file_uid}")
            continue

        # Stream into a sibling ".part" file and rename only once complete.
        # Otherwise an interrupted download would leave a truncated file that
        # the "already exists locally" check above would skip on every later run.
        partial_filepath = target_filepath + '.part'
        try:
            os.makedirs(os.path.dirname(partial_filepath) or '.', exist_ok=True)
            response = requests.get(download_url, stream=True, timeout=request_timeout)
            try:
                response.raise_for_status()
                with open(partial_filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            finally:
                # A streamed response holds its connection until closed.
                response.close()
            os.replace(partial_filepath, target_filepath)
        except Exception as e:
            # Best-effort cleanup; a failure here must not mask the real error.
            try:
                if os.path.exists(partial_filepath):
                    os.remove(partial_filepath)
            except OSError:
                pass
            if abort_on_error:
                raise
            print(f"Warning: Download failed for file {i}: {e}")

    if verbose:
        print('File download complete.')
| 149 | + |
def _filter_files_to_download(files, file_uuids):
    """
    Return the entries of ``files`` whose ``'uid'`` is listed in ``file_uuids``.

    Args:
        files: Sequence of file-info dicts, each with a ``'uid'`` key.
        file_uuids: Collection of uid strings to keep, a single uid string,
            or ``None`` to keep everything.

    Returns:
        ``files`` itself when ``file_uuids`` is ``None``; otherwise a new
        list of the matching entries in their original order.
    """
    if file_uuids is None:
        return files

    # A bare string would otherwise be matched by substring ('a' in 'ab'),
    # selecting unrelated files; treat it as one uid.
    if isinstance(file_uuids, str):
        file_uuids = [file_uuids]

    # Build the lookup set once so each membership test is O(1).
    wanted_uids = set(file_uuids)
    return [f for f in files if f['uid'] in wanted_uids]
| 157 | + |
def _display_progress(current_file_number, total_file_number):
    """
    Print a one-line progress message for the file about to be downloaded.

    The percentage is the ratio of ``current_file_number`` to
    ``total_file_number``, scaled to 100 and rounded to a whole number.
    """
    fraction_done = current_file_number / total_file_number
    percent_finished = round(fraction_done * 100)
    message = (
        f'Downloading file {current_file_number} of {total_file_number} '
        f'({percent_finished}% complete) ...'
    )
    print(message)
0 commit comments