Merge pull request #55 from Waltham-Data-Science/claude/ndi-cloud-api-porting-wumxY

stevevanhooser · web-flow · commit cd0346ddaea4 · 2026-04-20T20:15:17.000-04:00
Add bulkFetch and documentClassCounts API functions
diff --git a/src/ndi/cloud/api/documents.py b/src/ndi/cloud/api/documents.py
@@ -10,13 +10,17 @@
 
 from __future__ import annotations
 
+import re
 from typing import Annotated, Any
 
 from pydantic import SkipValidation, validate_call
 
 from ..client import APIResponse, CloudClient, _auto_client
 from ._validators import VALIDATE_CONFIG, CloudId, FilePath, PageNumber, PageSize, Scope
 
+_HEX24 = re.compile(r"^[0-9a-fA-F]{24}$")  # shape bulkFetch requires of each document ID
+_BULK_FETCH_MAX = 500  # largest doc_ids list bulkFetch accepts in one call
+
 _Client = Annotated[CloudClient | None, SkipValidation()]
 
 
@@ -178,6 +182,86 @@ def countDocuments(dataset_id: CloudId, *, client: _Client = None) -> int:
     return ds.get("documentCount", 0)
 
 
+@_auto_client
+@validate_call(config=VALIDATE_CONFIG)
+def bulkFetch(
+    dataset_id: CloudId,
+    doc_ids: list[str],
+    *,
+    client: _Client = None,
+) -> list[dict[str, Any]]:
+    """POST /datasets/{datasetId}/documents/bulk-fetch
+
+    Synchronously fetch up to 500 documents (with full data) from a
+    dataset in a single call.  This is the fast synchronous companion
+    to the asynchronous :func:`getBulkDownloadURL` pipeline and is
+    intended for small sets (e.g. a subset of IDs returned by
+    :func:`ndiquery`).
+
+    Documents that do not exist, are soft-deleted, or belong to another
+    dataset are silently omitted from the response.  The returned
+    documents are not guaranteed to be in request order.
+
+    MATLAB equivalent: +cloud/+api/+documents/bulkFetch.m
+
+    Args:
+        dataset_id: The ID of the dataset containing the documents.
+        doc_ids: Document IDs to fetch; validated as listed under Raises.
+        client: Authenticated cloud client (auto-created if omitted).
+
+    Returns:
+        A list of document dicts, each with fields ``id``, ``ndiId``,
+        ``name``, ``className``, ``datasetId``, and ``data``.
+
+    Raises:
+        ValueError: If ``doc_ids`` is empty, has more than 500 entries,
+            or has an entry that is not exactly 24 hexadecimal digits.
+    """
+    if not doc_ids:
+        raise ValueError("doc_ids must be non-empty")
+    if len(doc_ids) > _BULK_FETCH_MAX:
+        raise ValueError(f"doc_ids must have at most {_BULK_FETCH_MAX} entries")
+    for did in doc_ids:
+        # fullmatch, not match: "$" alone would let an ID with a trailing newline pass.
+        if not _HEX24.fullmatch(did):
+            raise ValueError(f"doc_ids entries must be 24-character hex strings: {did!r}")
+    endpoint = "/datasets/{datasetId}/documents/bulk-fetch"
+    result = client.post(endpoint, json={"documentIds": list(doc_ids)}, datasetId=dataset_id)
+    return result.get("documents", []) if isinstance(result, dict) else list(result or [])
+
+
+@_auto_client
+@validate_call(config=VALIDATE_CONFIG)
+def documentClassCounts(
+    dataset_id: CloudId,
+    *,
+    client: _Client = None,
+) -> dict[str, Any]:
+    """GET /datasets/{datasetId}/document-class-counts
+
+    Fetch per-class document totals for one dataset.  Counts are keyed
+    by each document's leaf ``data.document_class.class_name`` and form
+    a flat histogram: superclasses are not credited with their
+    subclasses' documents.  Use :func:`ndiquery` with the ``isa``
+    operator when an inheritance-aware breakdown is needed.
+
+    MATLAB equivalent: +cloud/+api/+documents/documentClassCounts.m
+
+    Args:
+        dataset_id: ID of the dataset whose documents are tallied.
+        client: Authenticated cloud client (auto-created if omitted).
+
+    Returns:
+        The value returned by ``client.get``.  Expected fields are
+        ``datasetId``, ``totalDocuments``, and ``classCounts`` (class
+        name -> integer count); documents whose ``class_name`` is
+        missing or empty are counted under ``'unknown'``.
+    """
+    # The path keeps its {datasetId} placeholder; the ID is passed as a keyword.
+    endpoint = "/datasets/{datasetId}/document-class-counts"
+    return client.get(endpoint, datasetId=dataset_id)
+
+
 @_auto_client
 @validate_call(config=VALIDATE_CONFIG)
 def bulkUpload(
diff --git a/src/ndi/cloud/api/ndi_matlab_python_bridge.yaml b/src/ndi/cloud/api/ndi_matlab_python_bridge.yaml
@@ -473,6 +473,53 @@ functions:
       Python convenience that combines getBulkUploadURL + putFiles.
       MATLAB does these as separate steps.
 
+  - name: bulkFetch
+    matlab_path: "+ndi/+cloud/+api/+documents/bulkFetch.m"
+    matlab_last_sync_hash: "bacdd0c3"
+    python_path: "ndi/cloud/api/documents.py"
+    input_arguments:
+      - name: dataset_id
+        type_matlab: "string"
+        type_python: "CloudId"
+      - name: doc_ids
+        type_matlab: "string array"
+        type_python: "list[str]"
+      - name: client
+        type_python: "_Client"
+        default: "None"
+    output_arguments:
+      - name: documents
+        type_python: "list[dict[str, Any]]"
+    decision_log: >
+      Synchronized with MATLAB main as of 2026-04-20. Synchronous bulk
+      fetch of up to 500 documents by ID via
+      POST /datasets/{datasetId}/documents/bulk-fetch. Mirrors MATLAB
+      input validation: non-empty, <= 500 entries, each a 24-character
+      hex string. MATLAB returns (b, answer, apiResponse, apiURL);
+      Python returns only the documents list (the 'answer'), consistent
+      with other api.* wrappers that delegate HTTP metadata to CloudClient.
+
+  - name: documentClassCounts
+    matlab_path: "+ndi/+cloud/+api/+documents/documentClassCounts.m"
+    matlab_last_sync_hash: "12bfe81"
+    python_path: "ndi/cloud/api/documents.py"
+    input_arguments:
+      - name: dataset_id
+        type_matlab: "string"
+        type_python: "CloudId"
+      - name: client
+        type_python: "_Client"
+        default: "None"
+    output_arguments:
+      - name: result
+        type_python: "dict[str, Any]"
+    decision_log: >
+      Synchronized with MATLAB main as of 2026-04-20.
+      GET /datasets/{datasetId}/document-class-counts. Returns a flat
+      histogram of leaf data.document_class.class_name with fields
+      datasetId, totalDocuments, and classCounts (a mapping of class
+      name to int). No inheritance roll-up is performed.
+
   - name: getBulkUploadURL
     matlab_path: "+ndi/+cloud/+api/+documents/getBulkUploadURL.m"
     matlab_last_sync_hash: "9b75c0fe"
diff --git a/src/ndi/gui/gui.py b/src/ndi/gui/gui.py
@@ -139,9 +139,7 @@ def _update_db_list(self) -> None:
             try:
                 from ndi.query import ndi_query
 
-                doc_list = self._session.database_search(
-                    ndi_query("document_class.class_name", "regex", "(.*)", "")
-                )
+                doc_list = self._session.database_search(ndi_query.all())
             except Exception:
                 doc_list = []
 
diff --git a/tests/test_cloud_api_documents.py b/tests/test_cloud_api_documents.py
@@ -0,0 +1,91 @@
+"""Unit tests for ndi.cloud.api.documents — no network required."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import pytest
+
+
+def _make_client() -> MagicMock:
+    """Build a MagicMock standing in for CloudClient, with canned config values."""
+    mock_client = MagicMock()
+    mock_client.config.api_url = "https://api.ndi-cloud.com/v1"
+    mock_client.config.org_id = "org-123"
+    return mock_client
+
+
+# --- Well-formed 24-char hex document IDs used as bulkFetch inputs ---------
+_HEX24_A = "a" * 24
+_HEX24_B = "b" * 24
+
+
+class TestBulkFetch:
+    """Checks bulkFetch argument validation and the bulk-fetch POST it issues."""
+
+    def test_returns_documents_list(self):
+        from ndi.cloud.api.documents import bulkFetch
+
+        payload = [{"id": _HEX24_A, "name": "d1"}, {"id": _HEX24_B, "name": "d2"}]
+        mock_client = _make_client()
+        mock_client.post.return_value = {"documents": payload}
+
+        fetched = bulkFetch("ds-1", [_HEX24_A, _HEX24_B], client=mock_client)
+
+        # Exactly one POST, to the templated route, carrying the IDs as JSON.
+        mock_client.post.assert_called_once()
+        args, kwargs = mock_client.post.call_args
+        assert args[0] == "/datasets/{datasetId}/documents/bulk-fetch"
+        assert kwargs["datasetId"] == "ds-1"
+        assert kwargs["json"] == {"documentIds": [_HEX24_A, _HEX24_B]}
+        assert [doc["name"] for doc in fetched] == ["d1", "d2"]
+
+    def test_empty_doc_ids_raises(self):
+        from ndi.cloud.api.documents import bulkFetch
+
+        with pytest.raises(ValueError, match="non-empty"):
+            bulkFetch("ds-1", [], client=_make_client())
+
+    def test_over_500_raises(self):
+        from ndi.cloud.api.documents import bulkFetch
+
+        too_many = [_HEX24_A for _ in range(501)]
+        with pytest.raises(ValueError, match="at most 500"):
+            bulkFetch("ds-1", too_many, client=_make_client())
+
+    def test_non_hex_id_raises(self):
+        from ndi.cloud.api.documents import bulkFetch
+
+        with pytest.raises(ValueError, match="24-character hex"):
+            bulkFetch("ds-1", ["not-a-hex-id"], client=_make_client())
+
+    def test_missing_documents_field_returns_empty(self):
+        from ndi.cloud.api.documents import bulkFetch
+
+        mock_client = _make_client()
+        mock_client.post.return_value = {}
+
+        assert bulkFetch("ds-1", [_HEX24_A], client=mock_client) == []
+
+
+class TestDocumentClassCounts:
+    """Checks the GET issued by documentClassCounts and the payload it hands back."""
+
+    def test_returns_response_dict(self):
+        from ndi.cloud.api.documents import documentClassCounts
+
+        mock_client = _make_client()
+        mock_client.get.return_value = {
+            "datasetId": "ds-1",
+            "totalDocuments": 3,
+            "classCounts": {"ndi_document_probe": 2, "unknown": 1},
+        }
+
+        summary = documentClassCounts("ds-1", client=mock_client)
+
+        mock_client.get.assert_called_once()
+        args, kwargs = mock_client.get.call_args
+        assert args[0] == "/datasets/{datasetId}/document-class-counts"
+        assert kwargs["datasetId"] == "ds-1"
+        assert summary["totalDocuments"] == 3
+        assert summary["classCounts"]["ndi_document_probe"] == 2
diff --git a/tests/test_cloud_live.py b/tests/test_cloud_live.py
@@ -794,13 +794,9 @@ def test_ndiquery_public(self, client):
         """ndiquery should return documents matching a search."""
         from ndi.cloud.api.documents import ndiquery
 
-        search = [
-            {
-                "field": "document_class.class_name",
-                "operation": "exact_string",
-                "param1": "session",
-            }
-        ]
+        # Class filtering on the cloud must go through the 'isa' operator;
+        # the document_class.class_name field path is no longer searchable.
+        search = [{"field": "", "operation": "isa", "param1": "session"}]
         result = _retry_on_server_error(
             lambda: ndiquery("public", search, page=1, page_size=5, client=client)
         )
@@ -828,13 +824,9 @@ def test_ndiqueryAll_paginates(self, client):
         """ndiqueryAll should auto-paginate results."""
         from ndi.cloud.api.documents import ndiqueryAll
 
-        search = [
-            {
-                "field": "document_class.class_name",
-                "operation": "exact_string",
-                "param1": "session",
-            }
-        ]
+        # Class filtering on the cloud must go through the 'isa' operator;
+        # the document_class.class_name field path is no longer searchable.
+        search = [{"field": "", "operation": "isa", "param1": "session"}]
         result = _retry_on_server_error(
             lambda: ndiqueryAll("public", search, page_size=3, client=client)
         )