VH-Lab
diff --git a/‎src/ndi/cloud/api/datasets/get_dataset.py‎
Lines changed: 14 additions & 0 deletions b/‎src/ndi/cloud/api/datasets/get_dataset.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/ndi/cloud/api/documents.py‎
Lines changed: 15 additions & 0 deletions b/‎src/ndi/cloud/api/documents.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/ndi/cloud/api/files/__init__.py‎ b/‎src/ndi/cloud/api/files/__init__.py‎
diff --git a/‎src/ndi/cloud/api/files/get_file_details.py‎
Lines changed: 15 additions & 0 deletions b/‎src/ndi/cloud/api/files/get_file_details.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/ndi/cloud/api/implementation/datasets/get_dataset.py‎
Lines changed: 43 additions & 0 deletions b/‎src/ndi/cloud/api/implementation/datasets/get_dataset.py‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎src/ndi/cloud/api/implementation/documents/get_bulk_download_url.py‎
Lines changed: 54 additions & 0 deletions b/‎src/ndi/cloud/api/implementation/documents/get_bulk_download_url.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎src/ndi/cloud/api/implementation/files/get_file_details.py‎
Lines changed: 45 additions & 0 deletions b/‎src/ndi/cloud/api/implementation/files/get_file_details.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎src/ndi/cloud/download/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎src/ndi/cloud/download/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/ndi/cloud/download/download_utils.py‎
Lines changed: 160 additions & 0 deletions b/‎src/ndi/cloud/download/download_utils.py‎
Lines changed: 160 additions & 0 deletions
diff --git a/‎src/ndi/cloud/internal/get_cloud_dataset_id_for_local_dataset.py‎
Lines changed: 18 additions & 0 deletions b/‎src/ndi/cloud/internal/get_cloud_dataset_id_for_local_dataset.py‎
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,14 @@
+from ..implementation.datasets.get_dataset import GetDataset as GetDatasetImpl
+
+def get_dataset(dataset_id):
+    """
+    Fetch the details of a cloud dataset.
+
+    Thin user-facing wrapper: builds a ``GetDatasetImpl`` call object for the
+    given dataset and returns the result of its ``execute()`` method.
+
+    Args:
+        dataset_id (str): The ID of the dataset.
+
+    Returns:
+        tuple: (success, answer, response, url)
+    """
+    return GetDatasetImpl(dataset_id).execute()
@@ -4,6 +4,7 @@
 from .implementation.documents.delete_document import DeleteDocument as DeleteDocumentImpl
 from .implementation.documents.list_dataset_documents import ListDatasetDocuments as ListDatasetDocumentsImpl
 from .implementation.documents.list_dataset_documents_all import ListDatasetDocumentsAll as ListDatasetDocumentsAllImpl
+from .implementation.documents.get_bulk_download_url import GetBulkDownloadURL as GetBulkDownloadURLImpl
 
 def add_document(dataset_id, document_info):
     """
@@ -90,3 +91,17 @@ def list_dataset_documents_all(dataset_id, page_size=20):
     """
     api_call = ListDatasetDocumentsAllImpl(dataset_id, page_size)
     return api_call.execute()
+
+def get_bulk_download_url(dataset_id, document_ids=None):
+    """
+    Request a pre-signed URL for bulk document download.
+
+    Thin user-facing wrapper: builds a ``GetBulkDownloadURLImpl`` call object
+    and returns the result of its ``execute()`` method.
+
+    Args:
+        dataset_id (str): The ID of the dataset.
+        document_ids (list of str, optional): List of cloud document IDs to download.
+
+    Returns:
+        tuple: (success, answer, response, url)
+    """
+    return GetBulkDownloadURLImpl(dataset_id, document_ids).execute()
@@ -0,0 +1,15 @@
+from ..implementation.files.get_file_details import GetFileDetails as GetFileDetailsImpl
+
+def get_file_details(dataset_id, file_uid):
+    """
+    Fetch the details of a single file in a cloud dataset.
+
+    Thin user-facing wrapper: builds a ``GetFileDetailsImpl`` call object and
+    returns the result of its ``execute()`` method.
+
+    Args:
+        dataset_id (str): The ID of the dataset.
+        file_uid (str): The UID of the file.
+
+    Returns:
+        tuple: (success, answer, response, url)
+    """
+    return GetFileDetailsImpl(dataset_id, file_uid).execute()
@@ -0,0 +1,43 @@
+from ...call import Call
+from ... import url
+from ....authenticate import authenticate
+import requests
+import json
+
+class GetDataset(Call):
+    """
+    Implementation class for getting dataset details.
+
+    Issues an authenticated GET request against the ``get_dataset`` endpoint.
+    """
+
+    def __init__(self, dataset_id):
+        """
+        Creates a new GetDataset API call object.
+
+        Args:
+            dataset_id (str): The ID of the dataset.
+        """
+        self.dataset_id = dataset_id
+        self.endpoint_name = 'get_dataset'
+
+    def execute(self):
+        """
+        Performs the API call.
+
+        Returns:
+            tuple: (success, answer, response, url).
+                ``success`` is True only for an HTTP 200 reply whose body
+                decodes as JSON. ``answer`` is the decoded JSON body, or the
+                raw response text when the body is not valid JSON.
+                ``response`` is the ``requests`` response object and ``url``
+                is the URL that was requested.
+        """
+        token = authenticate()
+        api_url = url.get_url(self.endpoint_name, dataset_id=self.dataset_id)
+
+        headers = {
+            'Accept': 'application/json',
+            'Authorization': f'Bearer {token}'
+        }
+
+        response = requests.get(api_url, headers=headers)
+
+        try:
+            answer = response.json()
+        except ValueError:
+            # Every JSON decode error that requests can raise (stdlib json,
+            # simplejson, or requests' own JSONDecodeError) derives from
+            # ValueError. Catching it here also covers a 200 reply with an
+            # undecodable body, which is reported as a failure instead of
+            # raising out of execute().
+            return False, response.text, response, api_url
+
+        return response.status_code == 200, answer, response, api_url
@@ -0,0 +1,54 @@
+from ...call import Call
+from ... import url
+from ....authenticate import authenticate
+import requests
+import json
+
+class GetBulkDownloadURL(Call):
+    """
+    Implementation class for getting a bulk download URL.
+
+    Issues an authenticated POST request against the
+    ``bulk_download_documents`` endpoint.
+    """
+
+    def __init__(self, dataset_id, document_ids=None):
+        """
+        Creates a new GetBulkDownloadURL API call object.
+
+        Args:
+            dataset_id (str): The ID of the dataset.
+            document_ids (list of str, optional): List of cloud document IDs to download.
+                                                  If None or empty, all documents are included.
+        """
+        self.dataset_id = dataset_id
+        # Normalise None to an empty list so the request body always carries a list.
+        self.document_ids = document_ids if document_ids is not None else []
+        self.endpoint_name = 'bulk_download_documents'
+
+    def execute(self):
+        """
+        Performs the API call.
+
+        Returns:
+            tuple: (success, answer, response, url).
+                On success ``answer`` is the value of the ``url`` field of the
+                JSON reply. On failure ``answer`` is the decoded JSON body, or
+                the raw response text when the body is not valid JSON.
+                ``response`` is the ``requests`` response object and ``url``
+                is the API URL that was requested.
+        """
+        token = authenticate()
+        api_url = url.get_url(self.endpoint_name, dataset_id=self.dataset_id)
+
+        headers = {
+            'Accept': 'application/json',
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {token}'
+        }
+
+        # The body specifies which document IDs to include
+        data = {'documentIds': self.document_ids}
+
+        response = requests.post(api_url, headers=headers, json=data)
+
+        try:
+            payload = response.json()
+        except ValueError:
+            # All JSON decode errors raised by requests derive from ValueError.
+            return False, response.text, response, api_url
+
+        if response.status_code not in (200, 201):
+            return False, payload, response, api_url
+
+        download_url = payload.get('url') if isinstance(payload, dict) else None
+        if not download_url:
+            # A success status without a usable URL leaves the caller nothing
+            # to download, so report it as a failure and hand back the payload.
+            return False, payload, response, api_url
+
+        return True, download_url, response, api_url
@@ -0,0 +1,45 @@
+from ...call import Call
+from ... import url
+from ....authenticate import authenticate
+import requests
+import json
+
+class GetFileDetails(Call):
+    """
+    Implementation class for getting file details.
+
+    Issues an authenticated GET request against the ``get_file_details``
+    endpoint.
+    """
+
+    def __init__(self, dataset_id, file_uid):
+        """
+        Creates a new GetFileDetails API call object.
+
+        Args:
+            dataset_id (str): The ID of the dataset.
+            file_uid (str): The UID of the file.
+        """
+        self.dataset_id = dataset_id
+        self.file_uid = file_uid
+        self.endpoint_name = 'get_file_details'
+
+    def execute(self):
+        """
+        Performs the API call.
+
+        Returns:
+            tuple: (success, answer, response, url).
+                ``success`` is True only for an HTTP 200 reply whose body
+                decodes as JSON. ``answer`` is the decoded JSON body, or the
+                raw response text when the body is not valid JSON.
+                ``response`` is the ``requests`` response object and ``url``
+                is the URL that was requested.
+        """
+        token = authenticate()
+        api_url = url.get_url(self.endpoint_name, dataset_id=self.dataset_id, file_uid=self.file_uid)
+
+        headers = {
+            'Accept': 'application/json',
+            'Authorization': f'Bearer {token}'
+        }
+
+        response = requests.get(api_url, headers=headers)
+
+        try:
+            answer = response.json()
+        except ValueError:
+            # Every JSON decode error that requests can raise (stdlib json,
+            # simplejson, or requests' own JSONDecodeError) derives from
+            # ValueError. Catching it here also covers a 200 reply with an
+            # undecodable body, which is reported as a failure instead of
+            # raising out of execute().
+            return False, response.text, response, api_url
+
+        return response.status_code == 200, answer, response, api_url
@@ -0,0 +1 @@
+from .download_utils import download_document_collection, download_dataset_files
@@ -0,0 +1,160 @@
+import os
+import requests
+import json
+import tempfile
+import zipfile
+import time
+from ..api.documents import get_bulk_download_url
+from ..api.datasets import get_dataset
+from ..api.files import get_file_details
+from ...document import Document
+
+def download_document_collection(dataset_id, document_ids=None, timeout=20, chunk_size=2000):
+    """
+    Download documents using bulk download with chunking.
+
+    The requested IDs are split into chunks of at most ``chunk_size``. For
+    each chunk a bulk-download URL is requested, the zip behind it is fetched
+    (retrying once per second until it succeeds or ``timeout`` seconds have
+    elapsed), and every top-level ``.json`` file in the archive is parsed.
+
+    Args:
+        dataset_id (str): The ID of the cloud dataset.
+        document_ids (list of str, optional): Cloud document IDs to download.
+            If None or empty, the IDs are taken from the ``'apiId'`` entry
+            returned by ``list_remote_document_ids``.
+        timeout (float): Seconds to keep retrying each chunk's download. Also
+            passed to ``requests`` as the per-request timeout so a stalled
+            connection cannot outlive the retry window indefinitely.
+        chunk_size (int): Maximum number of document IDs per bulk request.
+
+    Returns:
+        list: One ``Document`` per downloaded struct; an empty list when the
+        remote listing yields no document IDs.
+
+    Raises:
+        ValueError: If ``chunk_size`` is smaller than 1.
+        RuntimeError: If a bulk download URL cannot be obtained, or a chunk
+            cannot be downloaded before the timeout.
+    """
+    if chunk_size < 1:
+        raise ValueError('chunk_size must be a positive integer')
+
+    if not document_ids:
+        # Avoid circular import by importing inside function
+        from ...sync.internal.document_utils import list_remote_document_ids
+        print('No document IDs provided; fetching all document IDs from the server...')
+        id_map = list_remote_document_ids(dataset_id)
+        document_ids = id_map['apiId']
+        if not document_ids:
+            return []
+
+    num_docs = len(document_ids)
+    document_chunks = [document_ids[i:i + chunk_size] for i in range(0, num_docs, chunk_size)]
+    num_chunks = len(document_chunks)
+
+    all_document_structs = []
+    print(f'Beginning download of {num_docs} documents in {num_chunks} chunk(s).')
+
+    for c, chunk_doc_ids in enumerate(document_chunks, 1):
+        print(f'  Processing chunk {c} of {num_chunks} ({len(chunk_doc_ids)} documents)...')
+
+        success, download_url, _, _ = get_bulk_download_url(dataset_id, chunk_doc_ids)
+        if not success or not download_url:
+            # Fail fast: without a URL the retry loop below could only time out.
+            raise RuntimeError(f"Failed to get bulk download URL for chunk {c}: {download_url}")
+
+        # Reserve a path for the archive; the file is (re)written on each attempt.
+        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
+            temp_zip_path = temp_zip.name
+
+        try:
+            is_finished = False
+            last_error = None
+            # Monotonic clock: the deadline is unaffected by system clock changes.
+            deadline = time.monotonic() + timeout
+
+            while not is_finished and time.monotonic() < deadline:
+                try:
+                    # The context manager releases the streamed connection.
+                    with requests.get(download_url, stream=True, timeout=timeout) as response:
+                        if response.status_code == 200:
+                            with open(temp_zip_path, 'wb') as f:
+                                for block in response.iter_content(chunk_size=8192):
+                                    f.write(block)
+                            is_finished = True
+                        else:
+                            last_error = f'HTTP status {response.status_code}'
+                except requests.RequestException as err:
+                    # Only network-level failures are retried; local errors
+                    # (e.g. a full disk) propagate immediately.
+                    last_error = err
+
+                if not is_finished:
+                    time.sleep(1)
+
+            if not is_finished:
+                raise RuntimeError(
+                    f"Download failed for chunk {c} after timeout (last error: {last_error})"
+                )
+
+            with tempfile.TemporaryDirectory() as extract_dir:
+                with zipfile.ZipFile(temp_zip_path, 'r') as zip_ref:
+                    zip_ref.extractall(extract_dir)
+
+                # The Matlab logic assumes one JSON file per chunk
+                # (unzippedFiles{1}); here every top-level JSON file is read,
+                # in sorted order so the result order is deterministic.
+                for filename in sorted(os.listdir(extract_dir)):
+                    if not filename.endswith('.json'):
+                        continue
+                    with open(os.path.join(extract_dir, filename), 'r', encoding='utf-8') as f:
+                        # Handling potential NaN/Null is skipped for now, assuming standard JSON
+                        document_structs = json.load(f)
+                    # dropDuplicateDocsFromJsonDecode logic is skipped for now
+                    if isinstance(document_structs, list):
+                        all_document_structs.extend(document_structs)
+                    else:
+                        all_document_structs.append(document_structs)
+        finally:
+            if os.path.exists(temp_zip_path):
+                os.remove(temp_zip_path)
+
+    print(f'Download complete. Converting {len(all_document_structs)} structs to NDI documents...')
+    documents = [Document(d) for d in all_document_structs]
+    print('Processing complete.')
+    return documents
+
+def download_dataset_files(cloud_dataset_id, target_folder, file_uuids=None, verbose=True, abort_on_error=True):
+    """
+    Downloads dataset files from a cloud dataset.
+
+    Each file is written to ``target_folder`` under its ``uid``. Files that
+    are not marked as uploaded, or that already exist locally, are skipped.
+    Data is streamed to a ``.part`` file and renamed only once complete, so an
+    interrupted download is never mistaken for a finished file on a later run.
+
+    Args:
+        cloud_dataset_id (str): The ID of the cloud dataset.
+        target_folder (str): Folder to write the files into; created if missing.
+        file_uuids (list of str, optional): UIDs of the files to download.
+            If None, every file listed for the dataset is considered.
+        verbose (bool): If True, print progress messages.
+        abort_on_error (bool): If True, re-raise the first download error;
+            otherwise print a warning and continue with the next file.
+
+    Raises:
+        RuntimeError: If the dataset cannot be retrieved, or specific files
+            were requested but the dataset lists no files.
+    """
+    success, dataset_info, _, _ = get_dataset(cloud_dataset_id)
+    if not success:
+        raise RuntimeError(f"Failed to get dataset: {dataset_info}")
+
+    if 'files' not in dataset_info:
+        if file_uuids is not None:
+            raise RuntimeError('No files found in the dataset despite files requested.')
+        return
+
+    files = _filter_files_to_download(dataset_info['files'], file_uuids)
+    num_files = len(files)
+
+    if verbose:
+        print(f'Will download {num_files} files...')
+
+    # Ensure the destination exists so the first write does not fail.
+    os.makedirs(target_folder, exist_ok=True)
+
+    for i, file_info in enumerate(files, 1):
+        if verbose:
+            _display_progress(i, num_files)
+
+        file_uid = file_info['uid']
+        exists_on_cloud = file_info.get('uploaded', False)
+
+        if not exists_on_cloud:
+            print(f'Warning: File with uuid "{file_uid}" does not exist on the cloud, skipping...')
+            continue
+
+        target_filepath = os.path.join(target_folder, file_uid)
+        if os.path.exists(target_filepath):
+            if verbose:
+                print(f'File {i} already exists locally, skipping...')
+            continue
+
+        success, answer, _, _ = get_file_details(cloud_dataset_id, file_uid)
+        if not success:
+            print(f"Warning: Failed to get file details: {answer}")
+            continue
+
+        download_url = answer.get('downloadUrl')
+        if not download_url:
+            print(f"Warning: No download URL for file {file_uid}")
+            continue
+
+        partial_filepath = target_filepath + '.part'
+        try:
+            # The context manager releases the streamed connection.
+            with requests.get(download_url, stream=True) as response:
+                response.raise_for_status()
+                with open(partial_filepath, 'wb') as f:
+                    for block in response.iter_content(chunk_size=8192):
+                        f.write(block)
+            # Publish the file under its final name only once it is complete.
+            os.replace(partial_filepath, target_filepath)
+        except Exception as e:
+            # Best-effort cleanup; never let it mask the original error.
+            try:
+                os.remove(partial_filepath)
+            except OSError:
+                pass
+            if abort_on_error:
+                raise
+            print(f"Warning: Download failed for file {i}: {e}")
+
+    if verbose:
+        print('File download complete.')
+
+def _filter_files_to_download(files, file_uuids):
+    """
+    Select the file entries whose ``uid`` was requested.
+
+    Args:
+        files (list of dict): File entries, each with a ``'uid'`` key.
+        file_uuids (list of str, str, or None): UIDs to keep. None means no
+            filtering; a single string is treated as one UID.
+
+    Returns:
+        list of dict: ``files`` itself when ``file_uuids`` is None, otherwise
+        a new list with the matching entries in their original order.
+    """
+    if file_uuids is None:
+        return files
+
+    if isinstance(file_uuids, str):
+        # A bare string would otherwise be matched by substring
+        # ('ab' in 'abc'), selecting files that were never requested.
+        file_uuids = [file_uuids]
+
+    # Build the lookup set once instead of scanning the list per file.
+    wanted = set(file_uuids)
+    return [f for f in files if f['uid'] in wanted]
+
+def _display_progress(current_file_number, total_file_number):
+    """
+    Print a one-line progress message for the file currently being downloaded.
+
+    Args:
+        current_file_number (int): 1-based position of the current file.
+        total_file_number (int): Total number of files; must be non-zero.
+    """
+    fraction_done = current_file_number / total_file_number
+    percent_finished = round(fraction_done * 100)
+    print(f'Downloading file {current_file_number} of {total_file_number} ({percent_finished}% complete) ...')
@@ -0,0 +1,18 @@
+from ...query import Query
+
+def get_cloud_dataset_id_for_local_dataset(ndi_dataset):
+    """
+    Retrieves the cloud dataset ID for a local dataset.
+
+    Searches the dataset's database for documents matching the
+    ``'isa' 'dataset_remote'`` query and reads the ID from the single match.
+
+    Args:
+        ndi_dataset: Local dataset object providing ``database_search`` and
+            ``path``.
+
+    Returns:
+        tuple: (cloud_dataset_id, documents). ``('', [])`` when no matching
+        document is found; otherwise the ID and the list of matching
+        documents returned by the search.
+
+    Raises:
+        RuntimeError: If more than one matching document is found.
+    """
+    # Assuming database_search returns a list of documents
+    remote_docs = ndi_dataset.database_search(Query('', 'isa', 'dataset_remote'))
+
+    if len(remote_docs) > 1:
+        raise RuntimeError(f"Found more than one remote cloudDatasetId for the local dataset: {ndi_dataset.path}.")
+
+    if not remote_docs:
+        return '', []
+
+    # Assuming document structure
+    remote_props = remote_docs[0].document_properties['dataset_remote']
+    return remote_props['dataset_id'], remote_docs
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .download_utils import download_document_collection, download_dataset_files`