Skip to content

Commit 15d5cee

Browse files
committed
Port bulkFetch and documentClassCounts from ndi.cloud.api.documents
Mirrors two new commands added to the MATLAB +ndi/+cloud/+api/+documents namespace. MATLAB routes them through +implementation wrappers that normalize output style; the Python port uses CloudClient for the same role, so no +implementation mirror is needed. INTERFACE UPDATE: Added bulkFetch and documentClassCounts entries to src/ndi/cloud/api/ndi_matlab_python_bridge.yaml. - bulkFetch: POST /datasets/{datasetId}/documents/bulk-fetch; mirrors MATLAB input validation (non-empty, <= 500 entries, 24-char hex IDs) and returns the 'documents' array. - documentClassCounts: GET /datasets/{datasetId}/document-class-counts; returns the datasetId/totalDocuments/classCounts struct.
1 parent 4cd4f97 commit 15d5cee

3 files changed

Lines changed: 222 additions & 0 deletions

File tree

src/ndi/cloud/api/documents.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,17 @@
1010

1111
from __future__ import annotations
1212

13+
import re
1314
from typing import Annotated, Any
1415

1516
from pydantic import SkipValidation, validate_call
1617

1718
from ..client import APIResponse, CloudClient, _auto_client
1819
from ._validators import VALIDATE_CONFIG, CloudId, FilePath, PageNumber, PageSize, Scope
1920

21+
_HEX24 = re.compile(r"^[0-9a-fA-F]{24}$")
22+
_BULK_FETCH_MAX = 500
23+
2024
# Annotation for the optional ``client`` keyword of the API wrappers in this
# module. ``SkipValidation`` is pydantic's marker for "do not validate this
# argument", so ``validate_call`` passes the client object through untouched.
_Client = Annotated[CloudClient | None, SkipValidation()]
2125

2226

@@ -178,6 +182,86 @@ def countDocuments(dataset_id: CloudId, *, client: _Client = None) -> int:
178182
return ds.get("documentCount", 0)
179183

180184

185+
@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def bulkFetch(
    dataset_id: CloudId,
    doc_ids: list[str],
    *,
    client: _Client = None,
) -> list[dict[str, Any]]:
    """POST /datasets/{datasetId}/documents/bulk-fetch

    Synchronously fetch up to 500 documents (with full data) from a
    dataset in a single call. This is the fast synchronous companion
    to the asynchronous :func:`getBulkDownloadURL` pipeline and is
    intended for small sets (e.g. a subset of IDs returned by
    :func:`ndiquery`).

    Documents that do not exist, are soft-deleted, or do not belong to
    the specified dataset are silently omitted from the response. The
    order of the returned documents is not guaranteed to match the
    request order.

    MATLAB equivalent: +cloud/+api/+documents/bulkFetch.m

    Args:
        dataset_id: The ID of the dataset containing the documents.
        doc_ids: Document IDs to fetch. Must be non-empty, at most 500
            entries, and each entry must be a 24-character hex string.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        A list of document dicts, each with fields ``id``, ``ndiId``,
        ``name``, ``className``, ``datasetId``, and ``data``. An empty
        list is returned when the response carries no documents.

    Raises:
        ValueError: If ``doc_ids`` is empty, has more than 500 entries,
            or contains an entry that is not exactly 24 hex characters.
    """
    if not doc_ids:
        raise ValueError("doc_ids must be non-empty")
    if len(doc_ids) > _BULK_FETCH_MAX:
        raise ValueError(f"doc_ids must have at most {_BULK_FETCH_MAX} entries")
    for did in doc_ids:
        # fullmatch, not match: a ``$``-terminated pattern used with match()
        # also accepts an ID followed by a trailing newline.
        if not _HEX24.fullmatch(did):
            raise ValueError(f"doc_ids entries must be 24-character hex strings: {did!r}")
    result = client.post(
        "/datasets/{datasetId}/documents/bulk-fetch",
        json={"documentIds": list(doc_ids)},
        datasetId=dataset_id,
    )
    if isinstance(result, dict):
        # ``or []`` covers both a missing key and an explicit null value, so
        # callers always receive a list as the signature promises.
        return result.get("documents") or []
    return list(result or [])
231+
232+
233+
@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def documentClassCounts(
    dataset_id: CloudId,
    *,
    client: _Client = None,
) -> dict[str, Any]:
    """GET /datasets/{datasetId}/document-class-counts

    Fetch a flat histogram of the documents in a dataset, keyed by the
    leaf ``data.document_class.class_name`` of each document. Counts are
    not rolled up along the class hierarchy; use :func:`ndiquery` with
    the ``isa`` operator for class-aware drill-downs.

    MATLAB equivalent: +cloud/+api/+documents/documentClassCounts.m

    Args:
        dataset_id: The ID of the dataset to query.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        Dict with fields ``datasetId``, ``totalDocuments``, and
        ``classCounts`` (a mapping of class name to integer count).
        Documents with missing/empty ``class_name`` are bucketed under
        ``'unknown'``.
    """
    endpoint = "/datasets/{datasetId}/document-class-counts"
    histogram = client.get(endpoint, datasetId=dataset_id)
    return histogram
263+
264+
181265
@_auto_client
182266
@validate_call(config=VALIDATE_CONFIG)
183267
def bulkUpload(

src/ndi/cloud/api/ndi_matlab_python_bridge.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,53 @@ functions:
473473
Python convenience that combines getBulkUploadURL + putFiles.
474474
MATLAB does these as separate steps.
475475
476+
- name: bulkFetch
477+
matlab_path: "+ndi/+cloud/+api/+documents/bulkFetch.m"
478+
matlab_last_sync_hash: "bacdd0c3"
479+
python_path: "ndi/cloud/api/documents.py"
480+
input_arguments:
481+
- name: dataset_id
482+
type_matlab: "string"
483+
type_python: "CloudId"
484+
- name: doc_ids
485+
type_matlab: "string array"
486+
type_python: "list[str]"
487+
- name: client
488+
type_python: "_Client"
489+
default: "None"
490+
output_arguments:
491+
- name: documents
492+
type_python: "list[dict[str, Any]]"
493+
decision_log: >
494+
Synchronized with MATLAB main as of 2026-04-20. Synchronous bulk
495+
fetch of up to 500 documents by ID via
496+
POST /datasets/{datasetId}/documents/bulk-fetch. Mirrors MATLAB input validation: non-empty,
497+
<= 500 entries, each a 24-character hex string. MATLAB returns
498+
(b, answer, apiResponse, apiURL); Python returns only the documents
499+
list (the 'answer'), consistent with other api.* wrappers that
500+
delegate HTTP metadata to CloudClient.
501+
502+
- name: documentClassCounts
503+
matlab_path: "+ndi/+cloud/+api/+documents/documentClassCounts.m"
504+
matlab_last_sync_hash: "12bfe81"
505+
python_path: "ndi/cloud/api/documents.py"
506+
input_arguments:
507+
- name: dataset_id
508+
type_matlab: "string"
509+
type_python: "CloudId"
510+
- name: client
511+
type_python: "_Client"
512+
default: "None"
513+
output_arguments:
514+
- name: result
515+
type_python: "dict[str, Any]"
516+
decision_log: >
517+
Synchronized with MATLAB main as of 2026-04-20.
518+
GET /datasets/{datasetId}/document-class-counts. Returns a flat histogram of
519+
leaf data.document_class.class_name with fields datasetId,
520+
totalDocuments, and classCounts (a mapping of class name to int).
521+
No inheritance roll-up is performed.
522+
476523
- name: getBulkUploadURL
477524
matlab_path: "+ndi/+cloud/+api/+documents/getBulkUploadURL.m"
478525
matlab_last_sync_hash: "9b75c0fe"

tests/test_cloud_api_documents.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Unit tests for ndi.cloud.api.documents — no network required."""
2+
3+
from __future__ import annotations
4+
5+
from unittest.mock import MagicMock
6+
7+
import pytest
8+
9+
10+
def _make_client() -> MagicMock:
11+
"""Return a mock CloudClient."""
12+
client = MagicMock()
13+
client.config.org_id = "org-123"
14+
client.config.api_url = "https://api.ndi-cloud.com/v1"
15+
return client
16+
17+
18+
# --- 24-char hex helper for bulkFetch --------------------------------------
19+
_HEX24_A = "a" * 24
20+
_HEX24_B = "b" * 24
21+
22+
23+
class TestBulkFetch:
    """bulkFetch validates inputs and POSTs to /documents/bulk-fetch."""

    def test_returns_documents_list(self):
        """A well-formed call posts the IDs and returns the 'documents' array."""
        from ndi.cloud.api.documents import bulkFetch

        requested_ids = [_HEX24_A, _HEX24_B]
        served_docs = [{"id": _HEX24_A, "name": "d1"}, {"id": _HEX24_B, "name": "d2"}]
        client = _make_client()
        client.post.return_value = {"documents": served_docs}

        docs = bulkFetch("ds-1", requested_ids, client=client)

        client.post.assert_called_once()
        # A mock ``call`` object unpacks into (positional args, keyword args).
        positional, keywords = client.post.call_args
        assert positional[0] == "/datasets/{datasetId}/documents/bulk-fetch"
        assert keywords["datasetId"] == "ds-1"
        assert keywords["json"] == {"documentIds": [_HEX24_A, _HEX24_B]}
        assert [d["name"] for d in docs] == ["d1", "d2"]

    def test_empty_doc_ids_raises(self):
        """An empty ID list is rejected."""
        from ndi.cloud.api.documents import bulkFetch

        client = _make_client()
        with pytest.raises(ValueError, match="non-empty"):
            bulkFetch("ds-1", [], client=client)

    def test_over_500_raises(self):
        """501 IDs is one past the allowed maximum."""
        from ndi.cloud.api.documents import bulkFetch

        too_many = [_HEX24_A for _ in range(501)]
        client = _make_client()
        with pytest.raises(ValueError, match="at most 500"):
            bulkFetch("ds-1", too_many, client=client)

    def test_non_hex_id_raises(self):
        """An entry that is not 24 hex characters is rejected."""
        from ndi.cloud.api.documents import bulkFetch

        client = _make_client()
        with pytest.raises(ValueError, match="24-character hex"):
            bulkFetch("ds-1", ["not-a-hex-id"], client=client)

    def test_missing_documents_field_returns_empty(self):
        """A response without a 'documents' key yields an empty list."""
        from ndi.cloud.api.documents import bulkFetch

        client = _make_client()
        client.post.return_value = {}

        assert bulkFetch("ds-1", [_HEX24_A], client=client) == []
69+
70+
71+
class TestDocumentClassCounts:
    """documentClassCounts GETs /document-class-counts and returns the struct."""

    def test_returns_response_dict(self):
        """The wrapper issues one GET and hands back the response unchanged."""
        from ndi.cloud.api.documents import documentClassCounts

        server_reply = {
            "datasetId": "ds-1",
            "totalDocuments": 3,
            "classCounts": {"ndi_document_probe": 2, "unknown": 1},
        }
        client = _make_client()
        client.get.return_value = server_reply

        result = documentClassCounts("ds-1", client=client)

        client.get.assert_called_once()
        # A mock ``call`` object unpacks into (positional args, keyword args).
        positional, keywords = client.get.call_args
        assert positional[0] == "/datasets/{datasetId}/document-class-counts"
        assert keywords["datasetId"] == "ds-1"
        assert result["totalDocuments"] == 3
        assert result["classCounts"]["ndi_document_probe"] == 2

0 commit comments

Comments
 (0)