From b39c4229a692eb0a6d53471d91786a660ecf916f Mon Sep 17 00:00:00 2001
From: audriB <audri@walthamdatascience.com>
Date: Mon, 27 Apr 2026 11:31:21 -0400
Subject: [PATCH] perf(summary): use dataset-record fields when
 /document-class-counts times out
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke-test pass after PRs #101/#102/#103 found that for the largest
production datasets (Jess Haley 78k docs, Sophie Griswold 101k docs)
the /document-class-counts endpoint still exhausts its 20s deadline
on every synthesis attempt — the cloud's Mongo aggregation on
className without proper indexes simply doesn't fit in the budget.

Net effect: every cron warm cycle landed a DEGRADED summary with all
zero counts AND null per-class facts (because counts.subjects=0
short-circuited stage 2's openminds_subject fanout). The
DatasetSummaryCard rendered "0 sessions, 0 subjects, ..., Not
applicable" everywhere, even though the dataset record itself
reports `numberOfSubjects=1656` and `documentCount=78687`.

The frontend band-aid (Waltham-Data-Science/ndi-cloud-app#91) hides
this UX-side by enriching the degraded summary with record fields
client-side. This PR is the real fix at the BACKEND: when stage-1
counts times out, synthesize a counts envelope from the dataset
record's `numberOfSubjects` + `documentCount` and gate stage 2 from
those record fields directly. Stage 2 still attempts the per-class
fanouts (openminds_subject for species/strains/sexes,
probe_location for brainRegions, element for probeTypes), bounded
by their existing 25s per-class deadlines.

## Worst-case timing

  stage 1: 20s  (counts + dataset record in parallel — dataset
                 succeeds fast; counts hits the deadline)
  stage 2: 25s  (3 classes in parallel — each bounded by per-class
                 deadline; ndiquery for openminds_subject can succeed
                 even when class-counts is slow because they hit
                 different cloud endpoints)
  ontology: ~2s
  total:    ~47s   (well under Railway's 88s ceiling)

## What the user sees, before vs after

  Before:
    counts: { sessions: 0, subjects: 0, probes: 0, elements: 0,
              epochs: 0, totalDocuments: 0 }
    species: null
    brainRegions: null
    extractionWarnings: ["class counts query failed: ..."]

  After (Jess Haley):
    counts: { sessions: 0, subjects: 1656, probes: 0, elements: 0,
              epochs: 0, totalDocuments: 78687 }
    species: [Caenorhabditis elegans, Escherichia coli]  ← FROM cloud
    brainRegions: [whole nervous system]                  ← FROM cloud
    strains/sexes: real data when available
    extractionWarnings: ["class counts query failed: ..."]
    ↑ subjects + totalDocuments come from dataset record;
      species/brainRegions/strains/sexes come from REAL stage-2
      ndiquery+bulk_fetch (the per-class queries succeed in
      isolation; only /document-class-counts is pathologically slow).

## Caveat: per-class counts (sessions/probes/elements/epochs) stay 0

The dataset record doesn't expose these fields, so when stage-1
counts times out we can't populate them. They display as 0 with the
"X warnings" tooltip explaining the underlying cause.

A future iteration could optionally compute these from
`/dataset/:id/documents?class=...&pageSize=1` reading the response
envelope's `total` field, but that's 4 extra cloud calls per
degraded synthesis — not worth it for fields the user rarely
notices vs the now-restored species/brainRegions facts.

## Coverage

  - test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields:
    end-to-end pin asserting that with counts timed out + record
    fields populated, stage 2 attempts and succeeds, species comes
    back populated.
  - test_safe_record_int_handles_all_input_shapes: defensive helper
    accepts any input shape (None, dict-with-null, dict-with-string,
    negative int, missing key) and degrades to 0.

## Pairs with frontend

Combined with ndi-cloud-app#91 (record-fallback enrichment) +
ndi-cloud-app#92 (progressive document loading), the user-perceived
loading experience for large datasets is now:

  • Hero band renders instantly (raw record fields)
  • Summary card renders with real subjects + totalDocuments + species
    + brainRegions within ~25-45s of cold synthesis
  • Document Explorer shows first 50 rows immediately, more as user
    scrolls
  • Subsequent viewers within 24h get sub-second cache-hit responses
    (PR #103 differential TTL holds full successes longer)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/services/dataset_summary_service.py   |  92 +++++++++++++--
 .../unit/test_dataset_summary_service.py      | 106 ++++++++++++++++++
 2 files changed, 188 insertions(+), 10 deletions(-)

diff --git a/backend/services/dataset_summary_service.py b/backend/services/dataset_summary_service.py
index 72a17be..27abc0d 100644
--- a/backend/services/dataset_summary_service.py
+++ b/backend/services/dataset_summary_service.py
@@ -423,16 +423,52 @@ async def _build(
                 f"class counts query failed: counts fetch exceeded "
                 f"{STAGE_1_FETCH_TIMEOUT_SECONDS}s",
             )
-            # Zero-counts fallback: stage 2 sees `subjects_present=False`
-            # and `probe_present=False`, so the per-class fetches all
-            # short-circuit to `_empty_list()` and stage 2 finishes in
-            # microseconds. Net effect: a degraded summary with all
-            # facts None / 0 + two extraction warnings (counts + the
-            # implicit "no data so we didn't fan out" state).
+            # Synthesize a counts envelope from the dataset record's
+            # raw fields (``numberOfSubjects``, ``documentCount``)
+            # when the cloud's `/document-class-counts` endpoint
+            # times out. This is critical for large datasets (101k+
+            # docs) where /document-class-counts is THE perf bottleneck:
+            # without this fallback, ``subjects_present`` defaults to
+            # False, stage 2 short-circuits, and the user sees a
+            # degraded summary even though the dataset record itself
+            # has the subject count right there.
+            #
+            # The classCounts blob only includes ``subject`` (mapped
+            # from ``numberOfSubjects``) — the cloud's class-counts
+            # endpoint is the ONLY canonical source for per-class
+            # buckets like sessions/probes/elements/epochs, and we
+            # don't have those on the dataset record. Setting them to
+            # 0 keeps `_counts_from_raw` happy; their absence is
+            # already documented via the warning so an operator sees
+            # the cause-and-effect.
+            #
+            # `probe_present` upstream uses `counts.probes > 0 or
+            # counts.elements > 0` — neither is exposed on the
+            # dataset record. We bridge that by also signaling probe
+            # presence whenever ``documentCount > 0``: if the dataset
+            # has any documents at all, it's worth attempting the
+            # probe_location + element fanout. Worst case the
+            # ndiquery there times out at the per-class deadline and
+            # we fall back to None for those facts. Best case we land
+            # full data despite the counts timeout.
+            record_subjects = _safe_record_int(dataset_raw, "numberOfSubjects")
+            record_docs = _safe_record_int(dataset_raw, "documentCount")
             counts_raw = {
                 "datasetId": dataset_id,
-                "totalDocuments": 0,
-                "classCounts": {},
+                # Subjects + totalDocuments come straight from the
+                # dataset record. The per-class buckets stay empty —
+                # we don't have authoritative numbers for those. The
+                # later stage-2 gating reads `dataset_raw` directly
+                # via `_record_says_has_data()` to decide whether to
+                # attempt the probe_location + element ndiqueries
+                # despite the empty `probes`/`elements` counts (we
+                # don't want to fake those numbers — keeping the
+                # honest 0 means the UI doesn't claim per-class
+                # counts we never measured).
+                "totalDocuments": record_docs,
+                "classCounts": (
+                    {"subject": record_subjects} if record_subjects > 0 else {}
+                ),
             }
         else:
             # Mypy narrows the union via the isinstance check above.
@@ -445,8 +481,25 @@ async def _build(
         # brainRegions; element (primary=probe-like) → probeTypes. These all
         # share the dataset scope so we parallelize via asyncio.gather with
         # a shared semaphore bounding bulk-fetch concurrency.
-        subjects_present = counts.subjects > 0
-        probe_present = counts.probes > 0 or counts.elements > 0
+        #
+        # Gating: prefer the canonical counts (from /document-class-counts)
+        # when available; fall back to the dataset record's raw fields
+        # (``numberOfSubjects``, ``documentCount``) when stage-1 counts
+        # timed out. Without this fallback, every counts-timeout dataset
+        # would short-circuit stage 2 even when the record itself
+        # confirms subjects/documents exist — the user-perceived bug
+        # was Jess Haley's species/brainRegions rendering as `null` on
+        # the Summary card despite the hero band right above showing
+        # "78,687 documents · 1,656 subjects · Caenorhabditis elegans".
+        subjects_present = (
+            counts.subjects > 0
+            or _safe_record_int(dataset_raw, "numberOfSubjects") > 0
+        )
+        probe_present = (
+            counts.probes > 0
+            or counts.elements > 0
+            or _safe_record_int(dataset_raw, "documentCount") > 0
+        )
 
         if subjects_present:
             om_task = self._fetch_class_bounded(
@@ -713,6 +766,25 @@ def _result_or_warn(
     return cast(list[dict[str, Any]], result)
 
 
+def _safe_record_int(raw: Any, field: str) -> int:
+    """Read an integer field off a dataset-record dict, returning 0 on
+    any of: not-a-dict, missing field, non-int value, or negative.
+
+    Used by the stage-1 counts-timeout fallback path so a missing
+    ``numberOfSubjects`` (e.g. Sophie Griswold's record where the cloud
+    didn't populate it) cleanly degrades to 0 instead of a TypeError.
+    The 0 case still lets the Summary card render — it just leaves
+    the corresponding counts column at 0, matching what the cloud's
+    own /datasets/:id response says.
+    """
+    if not isinstance(raw, dict):
+        return 0
+    value = raw.get(field)
+    if isinstance(value, int) and value >= 0:
+        return value
+    return 0
+
+
 def _counts_from_raw(raw: dict[str, Any]) -> DatasetSummaryCounts:
     """``/document-class-counts`` returns ``{datasetId, totalDocuments,
     classCounts: {class_name: n}}``. We map the canonical classes; any
diff --git a/backend/tests/unit/test_dataset_summary_service.py b/backend/tests/unit/test_dataset_summary_service.py
index 2c96fe2..f96816a 100644
--- a/backend/tests/unit/test_dataset_summary_service.py
+++ b/backend/tests/unit/test_dataset_summary_service.py
@@ -713,6 +713,112 @@ async def _slow_counts(
     assert "exceeded 0.05s" in warnings_text
 
 
+# ---------------------------------------------------------------------------
+# Record-fallback gating for stage-2 fanout
+#
+# Smoke-test follow-up: when stage-1 /document-class-counts times out,
+# stage 2 was short-circuiting entirely (subjects_present=False,
+# probe_present=False) — even when the dataset record itself reports
+# `numberOfSubjects > 0` and `documentCount > 0`. Net effect: a degraded
+# summary with null per-class facts despite the dataset record having
+# enough information to gate the fanout. Fix uses record fields as
+# fallback when counts is degraded so stage 2 still attempts the
+# openminds_subject + probe_location + element ndiqueries.
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields(
+    cloud: NdiCloudClient,
+    ontology_service: OntologyService,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When `/document-class-counts` times out but the dataset record
+    has `numberOfSubjects > 0`, stage 2 must still attempt the
+    openminds_subject + probe_location + element fanout. Without this
+    fallback the summary degrades to null facts even on datasets where
+    the record-level signal is clear.
+    """
+    import asyncio as _asyncio
+
+    import backend.services.dataset_summary_service as svc_module
+
+    monkeypatch.setattr(svc_module, "STAGE_1_FETCH_TIMEOUT_SECONDS", 0.05)
+
+    dataset_id = "DSX"
+    async with respx.mock(
+        base_url="https://api.example.test/v1", assert_all_called=False,
+    ) as router:
+        # Dataset record returns fast with numberOfSubjects + documentCount
+        # populated — the FALLBACK signals stage 2 should run.
+        router.get(f"/datasets/{dataset_id}").respond(
+            200,
+            json=_dataset_raw(
+                _id=dataset_id, numberOfSubjects=2, documentCount=12,
+            ),
+        )
+
+        # Counts endpoint is slow → triggers stage-1 timeout.
+        async def _slow_counts(
+            request: httpx.Request, route: Any,
+        ) -> httpx.Response:
+            await _asyncio.sleep(0.5)
+            return httpx.Response(200, json=_counts_raw(subject=99))
+
+        router.get(
+            f"/datasets/{dataset_id}/document-class-counts",
+        ).mock(side_effect=_slow_counts)
+
+        # Stage-2 ndiqueries succeed (fast) and bulk-fetch returns the
+        # openminds species fixture. Demonstrates that the fallback
+        # gate kept stage 2 running and we got real per-class data
+        # despite the counts timeout.
+        router.post("/ndiquery").respond(
+            200, json=_ndiquery_body_for(["om-sp"]),
+        )
+        router.post(f"/datasets/{dataset_id}/documents/bulk-fetch").respond(
+            200, json=_bulk_fetch_body([HALEY_SPECIES]),
+        )
+
+        summary_svc = DatasetSummaryService(cloud, ontology_service)
+        summary = await summary_svc.build_summary(dataset_id, session=None)
+
+    # Counts came from the record fallback (subjects from
+    # numberOfSubjects, totalDocuments from documentCount):
+    assert summary.counts.subjects == 2
+    assert summary.counts.totalDocuments == 12
+    # Stage 2 RAN despite the counts timeout — species was extracted
+    # from the openminds_subject ndiquery+bulk-fetch path. This is
+    # the win this PR introduces.
+    assert summary.species is not None
+    assert len(summary.species) > 0
+    assert summary.species[0].ontologyId == "NCBITaxon:6239"
+    # Counts-timeout warning is still surfaced for operator awareness.
+    warnings_text = "\n".join(summary.extractionWarnings)
+    assert "class counts query failed" in warnings_text
+
+
+def test_safe_record_int_handles_all_input_shapes() -> None:
+    """Pin the helper that powers stage-2 record-fallback gating: it
+    must accept any input shape and return a non-negative int (0 on
+    anything other than a positive integer field value).
+    """
+    from backend.services.dataset_summary_service import _safe_record_int
+
+    # Happy path:
+    assert _safe_record_int({"numberOfSubjects": 42}, "numberOfSubjects") == 42
+    # Field present but null (Sophie Griswold's record):
+    assert _safe_record_int({"numberOfSubjects": None}, "numberOfSubjects") == 0
+    # Field missing entirely:
+    assert _safe_record_int({}, "numberOfSubjects") == 0
+    # Wrong type (string):
+    assert _safe_record_int({"numberOfSubjects": "1656"}, "numberOfSubjects") == 0
+    # Negative value (defensive):
+    assert _safe_record_int({"numberOfSubjects": -1}, "numberOfSubjects") == 0
+    # Non-dict input (e.g. dataset_raw was an exception, not a dict):
+    assert _safe_record_int(None, "numberOfSubjects") == 0
+    assert _safe_record_int("garbage", "numberOfSubjects") == 0
+
+
 @pytest.mark.asyncio
 async def test_stage_1_dataset_timeout_does_not_block_summary(
     cloud: NdiCloudClient,