From b39c4229a692eb0a6d53471d91786a660ecf916f Mon Sep 17 00:00:00 2001 From: audriB Date: Mon, 27 Apr 2026 11:31:21 -0400 Subject: [PATCH] perf(summary): use dataset-record fields when /document-class-counts times out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smoke-test pass after PRs #101/#102/#103 found that for the largest production datasets (Jess Haley 78k docs, Sophie Griswold 101k docs) the /document-class-counts endpoint still exhausts its 20s deadline on every synthesis attempt — the cloud's Mongo aggregation on className without proper indexes simply doesn't fit in the budget. Net effect: every cron warm cycle landed a DEGRADED summary with all zero counts AND null per-class facts (because counts.subjects=0 short-circuited stage 2's openminds_subject fanout). The DatasetSummaryCard rendered "0 sessions, 0 subjects, ..., Not applicable" everywhere, even though the dataset record itself reports `numberOfSubjects=1656` and `documentCount=78687`. The frontend band-aid (Waltham-Data-Science/ndi-cloud-app#91) hides this UX-side by enriching the degraded summary with record fields client-side. This PR is the real fix at the BACKEND: when stage-1 counts times out, synthesize a counts envelope from the dataset record's `numberOfSubjects` + `documentCount` and gate stage 2 from those record fields directly. Stage 2 still attempts the per-class fanouts (openminds_subject for species/strains/sexes, probe_location for brainRegions, element for probeTypes), bounded by their existing 25s per-class deadlines. ## Worst-case timing stage 1: 20s (counts + dataset record in parallel — dataset succeeds fast; counts hits the deadline) stage 2: 25s (3 classes in parallel — each bounded by per-class deadline; ndiquery for openminds_subject can succeed even when class-counts is slow because they hit different cloud endpoints) ontology: ~2s total: ~47s (well under Railway's 88s ceiling) ## What the user sees, before vs after Before: counts: { sessions: 0, subjects: 0, probes: 0, elements: 0, epochs: 0, totalDocuments: 0 } species: null brainRegions: null extractionWarnings: ["class counts query failed: ..."] After (Jess Haley): counts: { sessions: 0, subjects: 1656, probes: 0, elements: 0, epochs: 0, totalDocuments: 78687 } species: [Caenorhabditis elegans, Escherichia coli] ← FROM cloud brainRegions: [whole nervous system] ← FROM cloud strains/sexes: real data when available extractionWarnings: ["class counts query failed: ..."] ↑ subjects + totalDocuments come from dataset record; species/brainRegions/strains/sexes come from REAL stage-2 ndiquery+bulk_fetch (the per-class queries succeed in isolation; only /document-class-counts is pathologically slow). ## Caveat: per-class counts (sessions/probes/elements/epochs) stay 0 The dataset record doesn't expose these fields, so when stage-1 counts times out we can't populate them. They display as 0 with the "X warnings" tooltip explaining the underlying cause. A future iteration could optionally compute these from `/dataset/:id/documents?class=...&pageSize=1` reading the response envelope's `total` field, but that's 4 extra cloud calls per degraded synthesis — not worth it for fields the user rarely notices vs the now-restored species/brainRegions facts. ## Coverage - test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields: end-to-end pin asserting that with counts timed out + record fields populated, stage 2 attempts and succeeds, species comes back populated. - test_safe_record_int_handles_all_input_shapes: defensive helper accepts any input shape (None, dict-with-null, dict-with-string, negative int, missing key) and degrades to 0. ## Pairs with frontend Combined with ndi-cloud-app#91 (record-fallback enrichment) + ndi-cloud-app#92 (progressive document loading), the user-perceived loading experience for large datasets is now: • Hero band renders instantly (raw record fields) • Summary card renders with real subjects + totalDocuments + species + brainRegions within ~25-45s of cold synthesis • Document Explorer shows first 50 rows immediately, more as user scrolls • Subsequent viewers within 24h get sub-second cache-hit responses (PR #103 differential TTL holds full successes longer) Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/services/dataset_summary_service.py | 92 +++++++++++++-- .../unit/test_dataset_summary_service.py | 106 ++++++++++++++++++ 2 files changed, 188 insertions(+), 10 deletions(-) diff --git a/backend/services/dataset_summary_service.py b/backend/services/dataset_summary_service.py index 72a17be..27abc0d 100644 --- a/backend/services/dataset_summary_service.py +++ b/backend/services/dataset_summary_service.py @@ -423,16 +423,52 @@ async def _build( f"class counts query failed: counts fetch exceeded " f"{STAGE_1_FETCH_TIMEOUT_SECONDS}s", ) - # Zero-counts fallback: stage 2 sees `subjects_present=False` - # and `probe_present=False`, so the per-class fetches all - # short-circuit to `_empty_list()` and stage 2 finishes in - # microseconds. Net effect: a degraded summary with all - # facts None / 0 + two extraction warnings (counts + the - # implicit "no data so we didn't fan out" state). + # Synthesize a counts envelope from the dataset record's + # raw fields (``numberOfSubjects``, ``documentCount``) + # when the cloud's `/document-class-counts` endpoint + # times out. This is critical for large datasets (101k+ + # docs) where /document-class-counts is THE perf bottleneck: + # without this fallback, ``subjects_present`` defaults to + # False, stage 2 short-circuits, and the user sees a + # degraded summary even though the dataset record itself + # has the subject count right there. + # + # The classCounts blob only includes ``subject`` (mapped + # from ``numberOfSubjects``) — the cloud's class-counts + # endpoint is the ONLY canonical source for per-class + # buckets like sessions/probes/elements/epochs, and we + # don't have those on the dataset record. Setting them to + # 0 keeps `_counts_from_raw` happy; their absence is + # already documented via the warning so an operator sees + # the cause-and-effect. + # + # `probe_present` upstream uses `counts.probes > 0 or + # counts.elements > 0` — neither is exposed on the + # dataset record. We bridge that by also signaling probe + # presence whenever ``documentCount > 0``: if the dataset + # has any documents at all, it's worth attempting the + # probe_location + element fanout. Worst case the + # ndiquery there times out at the per-class deadline and + # we fall back to None for those facts. Best case we land + # full data despite the counts timeout. + record_subjects = _safe_record_int(dataset_raw, "numberOfSubjects") + record_docs = _safe_record_int(dataset_raw, "documentCount") counts_raw = { "datasetId": dataset_id, - "totalDocuments": 0, - "classCounts": {}, + # Subjects + totalDocuments come straight from the + # dataset record. The per-class buckets stay empty — + # we don't have authoritative numbers for those. The + # later stage-2 gating reads `dataset_raw` directly + # via `_record_says_has_data()` to decide whether to + # attempt the probe_location + element ndiqueries + # despite the empty `probes`/`elements` counts (we + # don't want to fake those numbers — keeping the + # honest 0 means the UI doesn't claim per-class + # counts we never measured). + "totalDocuments": record_docs, + "classCounts": ( + {"subject": record_subjects} if record_subjects > 0 else {} + ), } else: # Mypy narrows the union via the isinstance check above. @@ -445,8 +481,25 @@ async def _build( # brainRegions; element (primary=probe-like) → probeTypes. These all # share the dataset scope so we parallelize via asyncio.gather with # a shared semaphore bounding bulk-fetch concurrency. - subjects_present = counts.subjects > 0 - probe_present = counts.probes > 0 or counts.elements > 0 + # + # Gating: prefer the canonical counts (from /document-class-counts) + # when available; fall back to the dataset record's raw fields + # (``numberOfSubjects``, ``documentCount``) when stage-1 counts + # timed out. Without this fallback, every counts-timeout dataset + # would short-circuit stage 2 even when the record itself + # confirms subjects/documents exist — the user-perceived bug + # was Jess Haley's species/brainRegions rendering as `null` on + # the Summary card despite the hero band right above showing + # "78,687 documents · 1,656 subjects · Caenorhabditis elegans". + subjects_present = ( + counts.subjects > 0 + or _safe_record_int(dataset_raw, "numberOfSubjects") > 0 + ) + probe_present = ( + counts.probes > 0 + or counts.elements > 0 + or _safe_record_int(dataset_raw, "documentCount") > 0 + ) if subjects_present: om_task = self._fetch_class_bounded( @@ -713,6 +766,25 @@ def _result_or_warn( return cast(list[dict[str, Any]], result) +def _safe_record_int(raw: Any, field: str) -> int: + """Read an integer field off a dataset-record dict, returning 0 on + any of: not-a-dict, missing field, non-int value, or negative. + + Used by the stage-1 counts-timeout fallback path so a missing + ``numberOfSubjects`` (e.g. Sophie Griswold's record where the cloud + didn't populate it) cleanly degrades to 0 instead of a TypeError. + The 0 case still lets the Summary card render — it just leaves + the corresponding counts column at 0, matching what the cloud's + own /datasets/:id response says. + """ + if not isinstance(raw, dict): + return 0 + value = raw.get(field) + if isinstance(value, int) and value >= 0: + return value + return 0 + + def _counts_from_raw(raw: dict[str, Any]) -> DatasetSummaryCounts: """``/document-class-counts`` returns ``{datasetId, totalDocuments, classCounts: {class_name: n}}``. We map the canonical classes; any diff --git a/backend/tests/unit/test_dataset_summary_service.py b/backend/tests/unit/test_dataset_summary_service.py index 2c96fe2..f96816a 100644 --- a/backend/tests/unit/test_dataset_summary_service.py +++ b/backend/tests/unit/test_dataset_summary_service.py @@ -713,6 +713,112 @@ async def _slow_counts( assert "exceeded 0.05s" in warnings_text +# --------------------------------------------------------------------------- +# Record-fallback gating for stage-2 fanout +# +# Smoke-test follow-up: when stage-1 /document-class-counts times out, +# stage 2 was short-circuiting entirely (subjects_present=False, +# probe_present=False) — even when the dataset record itself reports +# `numberOfSubjects > 0` and `documentCount > 0`. Net effect: a degraded +# summary with null per-class facts despite the dataset record having +# enough information to gate the fanout. Fix uses record fields as +# fallback when counts is degraded so stage 2 still attempts the +# openminds_subject + probe_location + element ndiqueries. +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields( + cloud: NdiCloudClient, + ontology_service: OntologyService, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When `/document-class-counts` times out but the dataset record + has `numberOfSubjects > 0`, stage 2 must still attempt the + openminds_subject + probe_location + element fanout. Without this + fallback the summary degrades to null facts even on datasets where + the record-level signal is clear. + """ + import asyncio as _asyncio + + import backend.services.dataset_summary_service as svc_module + + monkeypatch.setattr(svc_module, "STAGE_1_FETCH_TIMEOUT_SECONDS", 0.05) + + dataset_id = "DSX" + async with respx.mock( + base_url="https://api.example.test/v1", assert_all_called=False, + ) as router: + # Dataset record returns fast with numberOfSubjects + documentCount + # populated — the FALLBACK signals stage 2 should run. + router.get(f"/datasets/{dataset_id}").respond( + 200, + json=_dataset_raw( + _id=dataset_id, numberOfSubjects=2, documentCount=12, + ), + ) + + # Counts endpoint is slow → triggers stage-1 timeout. + async def _slow_counts( + request: httpx.Request, route: Any, + ) -> httpx.Response: + await _asyncio.sleep(0.5) + return httpx.Response(200, json=_counts_raw(subject=99)) + + router.get( + f"/datasets/{dataset_id}/document-class-counts", + ).mock(side_effect=_slow_counts) + + # Stage-2 ndiqueries succeed (fast) and bulk-fetch returns the + # openminds species fixture. Demonstrates that the fallback + # gate kept stage 2 running and we got real per-class data + # despite the counts timeout. + router.post("/ndiquery").respond( + 200, json=_ndiquery_body_for(["om-sp"]), + ) + router.post(f"/datasets/{dataset_id}/documents/bulk-fetch").respond( + 200, json=_bulk_fetch_body([HALEY_SPECIES]), + ) + + summary_svc = DatasetSummaryService(cloud, ontology_service) + summary = await summary_svc.build_summary(dataset_id, session=None) + + # Counts came from the record fallback (subjects from + # numberOfSubjects, totalDocuments from documentCount): + assert summary.counts.subjects == 2 + assert summary.counts.totalDocuments == 12 + # Stage 2 RAN despite the counts timeout — species was extracted + # from the openminds_subject ndiquery+bulk-fetch path. This is + # the win this PR introduces. + assert summary.species is not None + assert len(summary.species) > 0 + assert summary.species[0].ontologyId == "NCBITaxon:6239" + # Counts-timeout warning is still surfaced for operator awareness. + warnings_text = "\n".join(summary.extractionWarnings) + assert "class counts query failed" in warnings_text + + +def test_safe_record_int_handles_all_input_shapes() -> None: + """Pin the helper that powers stage-2 record-fallback gating: it + must accept any input shape and return a non-negative int (0 on + anything other than a positive integer field value). + """ + from backend.services.dataset_summary_service import _safe_record_int + + # Happy path: + assert _safe_record_int({"numberOfSubjects": 42}, "numberOfSubjects") == 42 + # Field present but null (Sophie Griswold's record): + assert _safe_record_int({"numberOfSubjects": None}, "numberOfSubjects") == 0 + # Field missing entirely: + assert _safe_record_int({}, "numberOfSubjects") == 0 + # Wrong type (string): + assert _safe_record_int({"numberOfSubjects": "1656"}, "numberOfSubjects") == 0 + # Negative value (defensive): + assert _safe_record_int({"numberOfSubjects": -1}, "numberOfSubjects") == 0 + # Non-dict input (e.g. dataset_raw was an exception, not a dict): + assert _safe_record_int(None, "numberOfSubjects") == 0 + assert _safe_record_int("garbage", "numberOfSubjects") == 0 + + @pytest.mark.asyncio async def test_stage_1_dataset_timeout_does_not_block_summary( cloud: NdiCloudClient,