Skip to content

Commit bd3334b

Browse files
Merge pull request #53 from Waltham-Data-Science/claude/add-cloud-dataset-test-ENbIP
Add cloud dataset integration test for reading ingested data
2 parents 43ed8d2 + 6dce4b2 commit bd3334b

22 files changed

Lines changed: 1192 additions & 303 deletions

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ jobs:
5252
run: python -m ndi check
5353

5454
- name: Run tests with coverage
55+
env:
56+
NDI_CLOUD_USERNAME: ${{ secrets.TEST_USER_2_USERNAME }}
57+
NDI_CLOUD_PASSWORD: ${{ secrets.TEST_USER_2_PASSWORD }}
5558
run: |
5659
# Use sys.monitoring (PEP 669) on Python 3.12+ for faster coverage.
5760
# CTracer (sys.settrace) is catastrophically slow on 3.12 when

ndi_install.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@
4141
"python_path": ".",
4242
"description": "VH-Lab data utilities and file formats (not on PyPI)",
4343
},
44+
{
45+
"name": "NDIcalc-vis-matlab",
46+
"repo": "https://github.com/VH-Lab/NDIcalc-vis-matlab.git",
47+
"branch": "main",
48+
"python_path": "",
49+
"ndi_common": True,
50+
"description": "NDI calculator and visualization document definitions",
51+
},
4452
]
4553

4654
DEFAULT_TOOLS_DIR = Path.home() / ".ndi" / "tools"
@@ -268,6 +276,8 @@ def write_pth_file(site_packages: Path, tools_dir: Path) -> Path | None:
268276
lines = []
269277

270278
for dep in DEPENDENCIES:
279+
if not dep.get("python_path"):
280+
continue # No Python code to add to path
271281
dep_dir = tools_dir / dep["name"]
272282
python_path = dep_dir / dep["python_path"] if dep["python_path"] != "." else dep_dir
273283
if python_path.is_dir():
@@ -290,6 +300,56 @@ def write_pth_file(site_packages: Path, tools_dir: Path) -> Path | None:
290300
return None
291301

292302

303+
# ---------------------------------------------------------------------------
304+
# ndi_common document definitions from external dependencies
305+
# ---------------------------------------------------------------------------
306+
307+
308+
def install_ndi_common_docs(tools_dir: Path, ndi_root: Path) -> bool:
    """Copy ndi_common/{database,schema}_documents from external deps.

    Some dependencies (e.g. NDIcalc-vis-matlab) ship document type
    definitions that NDI-python needs at runtime.  This copies their
    ``ndi_common/database_documents`` and ``ndi_common/schema_documents``
    trees into NDI-python's own ``ndi_common`` folder so they are
    discoverable via ``ndi_common_PathConstants.DOCUMENT_PATH``.

    Only entries of ``DEPENDENCIES`` with a truthy ``"ndi_common"`` key are
    processed.  Files are copied with ``shutil.copy2``, so a file that
    already exists at the destination is overwritten.

    Args:
        tools_dir: Directory holding the dependency checkouts; each
            dependency is looked up at ``tools_dir / dep["name"]``.
        ndi_root: Root of the NDI-python tree; files are copied into
            ``ndi_root / "src" / "ndi" / "ndi_common"``.

    Returns:
        ``True`` if every flagged dependency had an ``ndi_common`` folder
        and contributed at least one file; ``False`` otherwise.
    """
    import shutil

    ndi_common = ndi_root / "src" / "ndi" / "ndi_common"
    ok = True

    for dep in DEPENDENCIES:
        if not dep.get("ndi_common"):
            continue
        dep_dir = tools_dir / dep["name"]
        dep_common = dep_dir / "ndi_common"
        if not dep_common.is_dir():
            warn(f"{dep['name']}: ndi_common folder not found at {dep_common}")
            ok = False
            continue

        # Track the per-dependency total so we do not claim success when
        # neither document tree exists (or both are empty).
        total = 0
        for sub in ("database_documents", "schema_documents"):
            src = dep_common / sub
            dst = ndi_common / sub
            if not src.is_dir():
                continue
            count = 0
            for src_file in src.rglob("*"):
                if src_file.is_dir():
                    continue
                # Mirror the source layout beneath the destination tree.
                rel = src_file.relative_to(src)
                dst_file = dst / rel
                dst_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(src_file, dst_file)
                count += 1
            detail(f"Copied {count} {sub} files from {dep['name']}")
            total += count

        if total:
            success(f"Installed document definitions from {dep['name']}")
        else:
            # A flagged dependency that yields nothing is treated like a
            # missing ndi_common folder: report it and signal failure.
            warn(f"{dep['name']}: no document definitions found under {dep_common}")
            ok = False

    return ok
351+
352+
293353
# ---------------------------------------------------------------------------
294354
# pip installation
295355
# ---------------------------------------------------------------------------
@@ -529,6 +589,8 @@ def main() -> int:
529589
fail("Could not find site-packages directory")
530590
warn("You may need to set PYTHONPATH manually:")
531591
for dep in DEPENDENCIES:
592+
if not dep.get("python_path"):
593+
continue
532594
dep_dir = tools_dir / dep["name"]
533595
python_path = dep_dir / dep["python_path"] if dep["python_path"] != "." else dep_dir
534596
warn(f" {python_path}")
@@ -546,6 +608,8 @@ def main() -> int:
546608
importlib.reload(site)
547609
# Add paths directly for this process
548610
for dep in DEPENDENCIES:
611+
if not dep.get("python_path"):
612+
continue # No Python code to add to path
549613
dep_dir = tools_dir / dep["name"]
550614
python_path = (
551615
str(dep_dir / dep["python_path"]) if dep["python_path"] != "." else str(dep_dir)
@@ -564,6 +628,9 @@ def main() -> int:
564628
if not install_ndi_and_deps(ndi_root, include_dev=args.dev):
565629
warn("Some packages may not have installed correctly")
566630

631+
# Copy document definitions from external dependencies
632+
install_ndi_common_docs(tools_dir, ndi_root)
633+
567634
# ── Step 5: Validate ───────────────────────────────────────────────
568635
if args.no_validate:
569636
print("\n[5/5] Validation skipped (--no-validate)")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ dependencies = [
3838
"did @ git+https://github.com/VH-Lab/DID-python.git@main",
3939
"ndr @ git+https://github.com/VH-lab/NDR-python.git@main",
4040
"vhlab-toolbox-python @ git+https://github.com/VH-Lab/vhlab-toolbox-python.git@main",
41+
"ndi-compress @ git+https://github.com/Waltham-Data-Science/NDI-compress-python.git@main",
4142
"numpy>=1.20.0",
4243
"networkx>=2.6",
4344
"jsonschema>=4.0.0",

src/ndi/cloud/orchestration.py

Lines changed: 55 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def downloadDataset(
8484
from ndi.dataset import ndi_dataset_dir
8585

8686
documents = jsons2documents(doc_jsons)
87-
conversion_lost = len(doc_jsons) - len(documents)
8887
dataset = ndi_dataset_dir("", target, documents=documents)
8988

9089
# Create remote link document if not already present
@@ -113,85 +112,76 @@ def downloadDataset(
113112
if verbose:
114113
print(f' Files downloaded: {report["downloaded"]}, failed: {report["failed"]}')
115114

116-
# Collect failures: conversion + exception-tracked + silent (DID-python)
117-
add_failures: list[tuple[str, str]] = list(getattr(dataset, "add_doc_failures", []))
118-
119-
# Cross-check using raw DID-python doc IDs (not isa('base') query,
120-
# which might miss documents whose type info wasn't stored correctly).
115+
# Verify every downloaded document made it into the local database.
116+
# The local dataset may have *more* documents (e.g. session and
117+
# session-in-a-dataset docs created internally), so we only check
118+
# that every remote doc ID is present locally.
121119
db_ids = set(
122120
dataset._session._database._driver._db.get_doc_ids(
123121
dataset._session._database._driver._branch_id
124122
)
125123
)
126124

127-
# Build a map from doc_id -> original JSON for missing-doc output
128-
doc_json_by_id: dict[str, dict] = {}
125+
missing: list[str] = []
126+
missing_jsons: list[dict] = []
129127
for dj in doc_jsons:
130128
did = dj.get("base", {}).get("id", "") if isinstance(dj, dict) else ""
131-
if did:
132-
doc_json_by_id[did] = dj
133-
134-
# Find documents that were "added" (no exception) but aren't in the DB
135-
tracked_ids = {f[0] for f in add_failures}
136-
silent_failures: list[str] = []
137-
for doc in documents:
138-
doc_id = (
139-
doc.document_properties.get("base", {}).get("id", "")
140-
if hasattr(doc, "document_properties")
141-
else doc.get("base", {}).get("id", "")
142-
)
143-
if doc_id and doc_id not in db_ids and doc_id not in tracked_ids:
144-
silent_failures.append(doc_id)
145-
146-
total_lost = conversion_lost + len(add_failures) + len(silent_failures)
129+
if did and did not in db_ids:
130+
missing.append(did)
131+
missing_jsons.append(dj)
147132

148133
if verbose:
149134
print("Download complete.")
150135

151-
if total_lost > 0:
152-
# Write missing documents to a JSON file for inspection
153-
missing_docs_path = target / "missingDocuments.json"
154-
missing_docs = []
155-
for doc_id in silent_failures:
156-
if doc_id in doc_json_by_id:
157-
missing_docs.append(doc_json_by_id[doc_id])
136+
if missing:
137+
# Print the document_class of each missing doc for diagnostics.
138+
# Session/dataset docs from older datasets are expected to be
139+
# missing (superseded by docs created locally during dataset init).
140+
session_dataset_types = {
141+
"ndi_session",
142+
"ndi_dataset",
143+
"session",
144+
"dataset",
145+
"session_in_a_dataset",
146+
"dataset_session_info",
147+
}
148+
real_missing: list[tuple[str, str]] = []
149+
for doc_id, dj in zip(missing, missing_jsons):
150+
doc_class = (
151+
dj.get("document_class", {}).get("class_name", "") if isinstance(dj, dict) else ""
152+
)
153+
superclasses = (
154+
dj.get("document_class", {}).get("superclasses", []) if isinstance(dj, dict) else []
155+
)
156+
all_types = {doc_class} | {
157+
sc.get("class_name", "") if isinstance(sc, dict) else str(sc)
158+
for sc in (superclasses if isinstance(superclasses, list) else [])
159+
}
160+
if all_types & session_dataset_types:
161+
print(
162+
f" Note: remote doc {doc_id} (class: {doc_class}) "
163+
f"not in local DB — expected for session/dataset docs"
164+
)
158165
else:
159-
missing_docs.append({"base": {"id": doc_id}})
160-
for doc_id, reason in add_failures:
161-
entry = dict(doc_json_by_id.get(doc_id, {"base": {"id": doc_id}}))
162-
entry["_add_error"] = reason
163-
missing_docs.append(entry)
164-
if missing_docs:
165-
import json
166+
print(f" WARNING: remote doc {doc_id} (class: {doc_class}) missing from local DB")
167+
real_missing.append((doc_id, doc_class))
166168

167-
missing_docs_path.write_text(json.dumps(missing_docs, indent=2, default=str))
169+
if real_missing:
170+
missing_docs_path = target / "missingDocuments.json"
171+
import json
168172

169-
lines = [
170-
f"Downloaded {len(doc_jsons)} documents but only "
171-
f"{len(db_ids)} were added to the dataset. "
172-
f"{total_lost} document(s) lost:"
173-
]
174-
if conversion_lost > 0:
175-
lines.append(f"\n{conversion_lost} failed to convert from JSON" " to ndi_document")
176-
if add_failures:
177-
lines.append(f"\n{len(add_failures)} raised errors during" " database add:")
178-
for doc_id, reason in add_failures[:50]:
179-
lines.append(f"\n - {doc_id}: {reason}")
180-
if len(add_failures) > 50:
181-
lines.append(f"\n ... and {len(add_failures) - 50} more")
182-
if silent_failures:
183-
lines.append(
184-
f"\n{len(silent_failures)} were passed to"
185-
" database.add() without error but are NOT in the"
186-
" database (possible DID-python bug):"
187-
)
188-
for doc_id in silent_failures[:50]:
189-
lines.append(f"\n - {doc_id}")
190-
if len(silent_failures) > 50:
191-
lines.append(f"\n ... and {len(silent_failures) - 50} more")
192-
if missing_docs:
193-
lines.append(f"\nFull JSON of missing documents written to:" f"\n {missing_docs_path}")
194-
raise RuntimeError("".join(lines))
173+
missing_docs_path.write_text(json.dumps(missing_jsons, indent=2, default=str))
174+
175+
lines = [
176+
f"Downloaded {len(doc_jsons)} documents but "
177+
f"{len(real_missing)} are missing from the local dataset:"
178+
]
179+
for doc_id, doc_class in real_missing[:50]:
180+
lines.append(f"\n - {doc_id} (class: {doc_class})")
181+
if len(real_missing) > 50:
182+
lines.append(f"\n ... and {len(real_missing) - 50} more")
183+
lines.append(f"\nFull JSON of missing documents written to:\n {missing_docs_path}")
184+
raise RuntimeError("".join(lines))
195185

196186
return dataset
197187

0 commit comments

Comments
 (0)