|
10 | 10 |
|
11 | 11 | from __future__ import annotations |
12 | 12 |
|
| 13 | +import re |
13 | 14 | from typing import Annotated, Any |
14 | 15 |
|
15 | 16 | from pydantic import SkipValidation, validate_call |
16 | 17 |
|
17 | 18 | from ..client import APIResponse, CloudClient, _auto_client |
18 | 19 | from ._validators import VALIDATE_CONFIG, CloudId, FilePath, PageNumber, PageSize, Scope |
19 | 20 |
|
# Matches exactly 24 hexadecimal characters (the document-ID format accepted
# by bulkFetch).  Anchored with \A / \Z rather than ^ / $ because ``$`` also
# matches just before a trailing newline, which would let "…<24 hex>\n" pass.
_HEX24 = re.compile(r"\A[0-9a-fA-F]{24}\Z")
# Maximum number of document IDs accepted by a single bulk-fetch request.
_BULK_FETCH_MAX = 500
| 23 | + |
20 | 24 | _Client = Annotated[CloudClient | None, SkipValidation()] |
21 | 25 |
|
22 | 26 |
|
@@ -178,6 +182,86 @@ def countDocuments(dataset_id: CloudId, *, client: _Client = None) -> int: |
178 | 182 | return ds.get("documentCount", 0) |
179 | 183 |
|
180 | 184 |
|
@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def bulkFetch(
    dataset_id: CloudId,
    doc_ids: list[str],
    *,
    client: _Client = None,
) -> list[dict[str, Any]]:
    """POST /datasets/{datasetId}/documents/bulk-fetch

    Synchronously fetch up to 500 documents (with full data) from a
    dataset in a single call.  This is the fast synchronous companion
    to the asynchronous :func:`getBulkDownloadURL` pipeline and is
    intended for small sets (e.g. a subset of IDs returned by
    :func:`ndiquery`).

    Documents that do not exist, are soft-deleted, or do not belong to
    the specified dataset are silently omitted from the response.  The
    order of the returned documents is not guaranteed to match the
    request order.

    MATLAB equivalent: +cloud/+api/+documents/bulkFetch.m

    Args:
        dataset_id: The ID of the dataset containing the documents.
        doc_ids: Document IDs to fetch.  Must be non-empty, at most 500
            entries, and each entry must be a 24-character hex string.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        A list of document dicts, each with fields ``id``, ``ndiId``,
        ``name``, ``className``, ``datasetId``, and ``data``.

    Raises:
        ValueError: If ``doc_ids`` is empty, has more than 500 entries,
            or contains an entry that is not exactly 24 hex characters.
    """
    # Validate locally so malformed requests fail fast, before any
    # network round-trip.
    if not doc_ids:
        raise ValueError("doc_ids must be non-empty")
    if len(doc_ids) > _BULK_FETCH_MAX:
        raise ValueError(f"doc_ids must have at most {_BULK_FETCH_MAX} entries")
    for did in doc_ids:
        # fullmatch (not match): with a ``$``-anchored pattern, match()
        # would also accept an ID followed by a trailing newline.
        if not _HEX24.fullmatch(did):
            raise ValueError(f"doc_ids entries must be 24-character hex strings: {did!r}")
    result = client.post(
        "/datasets/{datasetId}/documents/bulk-fetch",
        json={"documentIds": list(doc_ids)},
        datasetId=dataset_id,
    )
    # Accept either a ``{"documents": [...]}`` envelope or a bare list;
    # a falsy non-dict response is normalised to an empty list.
    return result.get("documents", []) if isinstance(result, dict) else list(result or [])
| 231 | + |
| 232 | + |
@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def documentClassCounts(
    dataset_id: CloudId,
    *,
    client: _Client = None,
) -> dict[str, Any]:
    """GET /datasets/{datasetId}/document-class-counts

    Fetch a flat per-class histogram of the documents in a dataset,
    keyed by the leaf ``data.document_class.class_name``.  Counts are
    not rolled up along the class inheritance chain; use
    :func:`ndiquery` with the ``isa`` operator when a class-aware
    drill-down is needed.

    MATLAB equivalent: +cloud/+api/+documents/documentClassCounts.m

    Args:
        dataset_id: The ID of the dataset to query.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        Dict with fields ``datasetId``, ``totalDocuments``, and
        ``classCounts`` (a mapping of class name to integer count).
        Documents whose ``class_name`` is missing or empty are counted
        under ``'unknown'``.
    """
    endpoint = "/datasets/{datasetId}/document-class-counts"
    # The client's response is handed back to the caller unmodified.
    class_histogram = client.get(endpoint, datasetId=dataset_id)
    return class_histogram
| 263 | + |
| 264 | + |
181 | 265 | @_auto_client |
182 | 266 | @validate_call(config=VALIDATE_CONFIG) |
183 | 267 | def bulkUpload( |
|
0 commit comments