Waltham-Data-Science · audriB · Apr 27, 2026
diff --git a/backend/services/dataset_summary_service.py b/backend/services/dataset_summary_service.py
@@ -423,16 +423,52 @@ async def _build(
                 f"class counts query failed: counts fetch exceeded "
                 f"{STAGE_1_FETCH_TIMEOUT_SECONDS}s",
             )
-            # Zero-counts fallback: stage 2 sees `subjects_present=False`
-            # and `probe_present=False`, so the per-class fetches all
-            # short-circuit to `_empty_list()` and stage 2 finishes in
-            # microseconds. Net effect: a degraded summary with all
-            # facts None / 0 + two extraction warnings (counts + the
-            # implicit "no data so we didn't fan out" state).
+            # Synthesize a counts envelope from the dataset record's
+            # raw fields (``numberOfSubjects``, ``documentCount``)
+            # when the cloud's `/document-class-counts` endpoint
+            # times out. This is critical for large datasets (101k+
+            # docs) where /document-class-counts is THE perf bottleneck:
+            # without this fallback, ``subjects_present`` defaults to
+            # False, stage 2 short-circuits, and the user sees a
+            # degraded summary even though the dataset record itself
+            # has the subject count right there.
+            #
+            # The classCounts blob only includes ``subject`` (mapped
+            # from ``numberOfSubjects``) — the cloud's class-counts
+            # endpoint is the ONLY canonical source for per-class
+            # buckets like sessions/probes/elements/epochs, and we
+            # don't have those on the dataset record. Setting them to
+            # 0 keeps `_counts_from_raw` happy; their absence is
+            # already documented via the warning so an operator sees
+            # the cause-and-effect.
+            #
+            # `probe_present` upstream uses `counts.probes > 0 or
+            # counts.elements > 0` — neither is exposed on the
+            # dataset record. We bridge that by also signaling probe
+            # presence whenever ``documentCount > 0``: if the dataset
+            # has any documents at all, it's worth attempting the
+            # probe_location + element fanout. Worst case the
+            # ndiquery there times out at the per-class deadline and
+            # we fall back to None for those facts. Best case we land
+            # full data despite the counts timeout.
+            record_subjects = _safe_record_int(dataset_raw, "numberOfSubjects")
+            record_docs = _safe_record_int(dataset_raw, "documentCount")
             counts_raw = {
                 "datasetId": dataset_id,
-                "totalDocuments": 0,
-                "classCounts": {},
+                # Subjects + totalDocuments come straight from the
+                # dataset record. The per-class buckets stay empty —
+                # we don't have authoritative numbers for those. The
+                # later stage-2 gating reads `dataset_raw` directly
+                # via `_record_says_has_data()` to decide whether to
+                # attempt the probe_location + element ndiqueries
+                # despite the empty `probes`/`elements` counts (we
+                # don't want to fake those numbers — keeping the
+                # honest 0 means the UI doesn't claim per-class
+                # counts we never measured).
+                "totalDocuments": record_docs,
+                "classCounts": (
+                    {"subject": record_subjects} if record_subjects > 0 else {}
+                ),
             }
         else:
             # Mypy narrows the union via the isinstance check above.
@@ -445,8 +481,25 @@ async def _build(
         # brainRegions; element (primary=probe-like) → probeTypes. These all
         # share the dataset scope so we parallelize via asyncio.gather with
         # a shared semaphore bounding bulk-fetch concurrency.
-        subjects_present = counts.subjects > 0
-        probe_present = counts.probes > 0 or counts.elements > 0
+        #
+        # Gating: prefer the canonical counts (from /document-class-counts)
+        # when available; fall back to the dataset record's raw fields
+        # (``numberOfSubjects``, ``documentCount``) when stage-1 counts
+        # timed out. Without this fallback, every counts-timeout dataset
+        # would short-circuit stage 2 even when the record itself
+        # confirms subjects/documents exist — the user-perceived bug
+        # was Jess Haley's species/brainRegions rendering as `null` on
+        # the Summary card despite the hero band right above showing
+        # "78,687 documents · 1,656 subjects · Caenorhabditis elegans".
+        subjects_present = (
+            counts.subjects > 0
+            or _safe_record_int(dataset_raw, "numberOfSubjects") > 0
+        )
+        probe_present = (
+            counts.probes > 0
+            or counts.elements > 0
+            or _safe_record_int(dataset_raw, "documentCount") > 0
+        )
 
         if subjects_present:
             om_task = self._fetch_class_bounded(
@@ -713,6 +766,25 @@ def _result_or_warn(
     return cast(list[dict[str, Any]], result)
 
 
+def _safe_record_int(raw: Any, field: str) -> int:
+    """Read an integer field off a dataset-record dict, returning 0 on
+    any of: not-a-dict, missing field, non-int value, or negative.
+
+    Used by the stage-1 counts-timeout fallback path so a missing
+    ``numberOfSubjects`` (e.g. Sophie Griswold's record where the cloud
+    didn't populate it) cleanly degrades to 0 instead of a TypeError.
+    The 0 case still lets the Summary card render — it just leaves
+    the corresponding counts column at 0, matching what the cloud's
+    own /datasets/:id response says.
+    """
+    if not isinstance(raw, dict):
+        return 0
+    value = raw.get(field)
+    if isinstance(value, int) and value >= 0:
+        return value
+    return 0
+
+
 def _counts_from_raw(raw: dict[str, Any]) -> DatasetSummaryCounts:
     """``/document-class-counts`` returns ``{datasetId, totalDocuments,
     classCounts: {class_name: n}}``. We map the canonical classes; any

diff --git a/backend/tests/unit/test_dataset_summary_service.py b/backend/tests/unit/test_dataset_summary_service.py
@@ -713,6 +713,112 @@ async def _slow_counts(
     assert "exceeded 0.05s" in warnings_text
 
 
+# ---------------------------------------------------------------------------
+# Record-fallback gating for stage-2 fanout
+#
+# Smoke-test follow-up: when stage-1 /document-class-counts times out,
+# stage 2 was short-circuiting entirely (subjects_present=False,
+# probe_present=False) — even when the dataset record itself reports
+# `numberOfSubjects > 0` and `documentCount > 0`. Net effect: a degraded
+# summary with null per-class facts despite the dataset record having
+# enough information to gate the fanout. Fix uses record fields as
+# fallback when counts is degraded so stage 2 still attempts the
+# openminds_subject + probe_location + element ndiqueries.
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields(
+    cloud: NdiCloudClient,
+    ontology_service: OntologyService,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When `/document-class-counts` times out but the dataset record
+    has `numberOfSubjects > 0`, stage 2 must still attempt the
+    openminds_subject + probe_location + element fanout. Without this
+    fallback the summary degrades to null facts even on datasets where
+    the record-level signal is clear.
+    """
+    import asyncio as _asyncio
+
+    import backend.services.dataset_summary_service as svc_module
+
+    monkeypatch.setattr(svc_module, "STAGE_1_FETCH_TIMEOUT_SECONDS", 0.05)
+
+    dataset_id = "DSX"
+    async with respx.mock(
+        base_url="https://api.example.test/v1", assert_all_called=False,
+    ) as router:
+        # Dataset record returns fast with numberOfSubjects + documentCount
+        # populated — the FALLBACK signals stage 2 should run.
+        router.get(f"/datasets/{dataset_id}").respond(
+            200,
+            json=_dataset_raw(
+                _id=dataset_id, numberOfSubjects=2, documentCount=12,
+            ),
+        )
+
+        # Counts endpoint is slow → triggers stage-1 timeout.
+        async def _slow_counts(
+            request: httpx.Request, route: Any,
+        ) -> httpx.Response:
+            await _asyncio.sleep(0.5)
+            return httpx.Response(200, json=_counts_raw(subject=99))
+
+        router.get(
+            f"/datasets/{dataset_id}/document-class-counts",
+        ).mock(side_effect=_slow_counts)
+
+        # Stage-2 ndiqueries succeed (fast) and bulk-fetch returns the
+        # openminds species fixture. Demonstrates that the fallback
+        # gate kept stage 2 running and we got real per-class data
+        # despite the counts timeout.
+        router.post("/ndiquery").respond(
+            200, json=_ndiquery_body_for(["om-sp"]),
+        )
+        router.post(f"/datasets/{dataset_id}/documents/bulk-fetch").respond(
+            200, json=_bulk_fetch_body([HALEY_SPECIES]),
+        )
+
+        summary_svc = DatasetSummaryService(cloud, ontology_service)
+        summary = await summary_svc.build_summary(dataset_id, session=None)
+
+    # Counts came from the record fallback (subjects from
+    # numberOfSubjects, totalDocuments from documentCount):
+    assert summary.counts.subjects == 2
+    assert summary.counts.totalDocuments == 12
+    # Stage 2 RAN despite the counts timeout — species was extracted
+    # from the openminds_subject ndiquery+bulk-fetch path. This is
+    # the win this PR introduces.
+    assert summary.species is not None
+    assert len(summary.species) > 0
+    assert summary.species[0].ontologyId == "NCBITaxon:6239"
+    # Counts-timeout warning is still surfaced for operator awareness.
+    warnings_text = "\n".join(summary.extractionWarnings)
+    assert "class counts query failed" in warnings_text
+
+
+def test_safe_record_int_handles_all_input_shapes() -> None:
+    """Pin the helper that powers stage-2 record-fallback gating: it
+    must accept any input shape and return a non-negative int (0 on
+    anything other than a positive integer field value).
+    """
+    from backend.services.dataset_summary_service import _safe_record_int
+
+    # Happy path:
+    assert _safe_record_int({"numberOfSubjects": 42}, "numberOfSubjects") == 42
+    # Field present but null (Sophie Griswold's record):
+    assert _safe_record_int({"numberOfSubjects": None}, "numberOfSubjects") == 0
+    # Field missing entirely:
+    assert _safe_record_int({}, "numberOfSubjects") == 0
+    # Wrong type (string):
+    assert _safe_record_int({"numberOfSubjects": "1656"}, "numberOfSubjects") == 0
+    # Negative value (defensive):
+    assert _safe_record_int({"numberOfSubjects": -1}, "numberOfSubjects") == 0
+    # Non-dict input (e.g. dataset_raw was an exception, not a dict):
+    assert _safe_record_int(None, "numberOfSubjects") == 0
+    assert _safe_record_int("garbage", "numberOfSubjects") == 0
+
+
 @pytest.mark.asyncio
 async def test_stage_1_dataset_timeout_does_not_block_summary(
     cloud: NdiCloudClient,