Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 82 additions & 10 deletions backend/services/dataset_summary_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,16 +423,52 @@ async def _build(
f"class counts query failed: counts fetch exceeded "
f"{STAGE_1_FETCH_TIMEOUT_SECONDS}s",
)
# Zero-counts fallback: stage 2 sees `subjects_present=False`
# and `probe_present=False`, so the per-class fetches all
# short-circuit to `_empty_list()` and stage 2 finishes in
# microseconds. Net effect: a degraded summary with all
# facts None / 0 + two extraction warnings (counts + the
# implicit "no data so we didn't fan out" state).
# Synthesize a counts envelope from the dataset record's
# raw fields (``numberOfSubjects``, ``documentCount``)
# when the cloud's `/document-class-counts` endpoint
# times out. This is critical for large datasets (101k+
# docs) where /document-class-counts is THE perf bottleneck:
# without this fallback, ``subjects_present`` defaults to
# False, stage 2 short-circuits, and the user sees a
# degraded summary even though the dataset record itself
# has the subject count right there.
#
# The classCounts blob only includes ``subject`` (mapped
# from ``numberOfSubjects``) — the cloud's class-counts
# endpoint is the ONLY canonical source for per-class
# buckets like sessions/probes/elements/epochs, and we
# don't have those on the dataset record. Setting them to
# 0 keeps `_counts_from_raw` happy; their absence is
# already documented via the warning so an operator sees
# the cause-and-effect.
#
# `probe_present` upstream uses `counts.probes > 0 or
# counts.elements > 0` — neither is exposed on the
# dataset record. We bridge that by also signaling probe
# presence whenever ``documentCount > 0``: if the dataset
# has any documents at all, it's worth attempting the
# probe_location + element fanout. Worst case the
# ndiquery there times out at the per-class deadline and
# we fall back to None for those facts. Best case we land
# full data despite the counts timeout.
record_subjects = _safe_record_int(dataset_raw, "numberOfSubjects")
record_docs = _safe_record_int(dataset_raw, "documentCount")
counts_raw = {
"datasetId": dataset_id,
"totalDocuments": 0,
"classCounts": {},
# Subjects + totalDocuments come straight from the
# dataset record. The per-class buckets stay empty —
# we don't have authoritative numbers for those. The
# later stage-2 gating reads `dataset_raw` directly
# via `_record_says_has_data()` to decide whether to
# attempt the probe_location + element ndiqueries
# despite the empty `probes`/`elements` counts (we
# don't want to fake those numbers — keeping the
# honest 0 means the UI doesn't claim per-class
# counts we never measured).
"totalDocuments": record_docs,
"classCounts": (
{"subject": record_subjects} if record_subjects > 0 else {}
),
}
else:
# Mypy narrows the union via the isinstance check above.
Expand All @@ -445,8 +481,25 @@ async def _build(
# brainRegions; element (primary=probe-like) → probeTypes. These all
# share the dataset scope so we parallelize via asyncio.gather with
# a shared semaphore bounding bulk-fetch concurrency.
subjects_present = counts.subjects > 0
probe_present = counts.probes > 0 or counts.elements > 0
#
# Gating: prefer the canonical counts (from /document-class-counts)
# when available; fall back to the dataset record's raw fields
# (``numberOfSubjects``, ``documentCount``) when stage-1 counts
# timed out. Without this fallback, every counts-timeout dataset
# would short-circuit stage 2 even when the record itself
# confirms subjects/documents exist — the user-perceived bug
# was Jess Haley's species/brainRegions rendering as `null` on
# the Summary card despite the hero band right above showing
# "78,687 documents · 1,656 subjects · Caenorhabditis elegans".
subjects_present = (
counts.subjects > 0
or _safe_record_int(dataset_raw, "numberOfSubjects") > 0
)
probe_present = (
counts.probes > 0
or counts.elements > 0
or _safe_record_int(dataset_raw, "documentCount") > 0
)

if subjects_present:
om_task = self._fetch_class_bounded(
Expand Down Expand Up @@ -713,6 +766,25 @@ def _result_or_warn(
return cast(list[dict[str, Any]], result)


def _safe_record_int(raw: Any, field: str) -> int:
"""Read an integer field off a dataset-record dict, returning 0 on
any of: not-a-dict, missing field, non-int value, or negative.

Used by the stage-1 counts-timeout fallback path so a missing
``numberOfSubjects`` (e.g. Sophie Griswold's record where the cloud
didn't populate it) cleanly degrades to 0 instead of a TypeError.
The 0 case still lets the Summary card render — it just leaves
the corresponding counts column at 0, matching what the cloud's
own /datasets/:id response says.
"""
if not isinstance(raw, dict):
return 0
value = raw.get(field)
if isinstance(value, int) and value >= 0:
return value
return 0


def _counts_from_raw(raw: dict[str, Any]) -> DatasetSummaryCounts:
"""``/document-class-counts`` returns ``{datasetId, totalDocuments,
classCounts: {class_name: n}}``. We map the canonical classes; any
Expand Down
106 changes: 106 additions & 0 deletions backend/tests/unit/test_dataset_summary_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,112 @@ async def _slow_counts(
assert "exceeded 0.05s" in warnings_text


# ---------------------------------------------------------------------------
# Record-fallback gating for stage-2 fanout
#
# Smoke-test follow-up: when stage-1 /document-class-counts times out,
# stage 2 was short-circuiting entirely (subjects_present=False,
# probe_present=False) — even when the dataset record itself reports
# `numberOfSubjects > 0` and `documentCount > 0`. Net effect: a degraded
# summary with null per-class facts despite the dataset record having
# enough information to gate the fanout. Fix uses record fields as
# fallback when counts is degraded so stage 2 still attempts the
# openminds_subject + probe_location + element ndiqueries.
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_stage_1_counts_timeout_still_runs_stage_2_via_record_fields(
cloud: NdiCloudClient,
ontology_service: OntologyService,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""When `/document-class-counts` times out but the dataset record
has `numberOfSubjects > 0`, stage 2 must still attempt the
openminds_subject + probe_location + element fanout. Without this
fallback the summary degrades to null facts even on datasets where
the record-level signal is clear.
"""
import asyncio as _asyncio

import backend.services.dataset_summary_service as svc_module

monkeypatch.setattr(svc_module, "STAGE_1_FETCH_TIMEOUT_SECONDS", 0.05)

dataset_id = "DSX"
async with respx.mock(
base_url="https://api.example.test/v1", assert_all_called=False,
) as router:
# Dataset record returns fast with numberOfSubjects + documentCount
# populated — the FALLBACK signals stage 2 should run.
router.get(f"/datasets/{dataset_id}").respond(
200,
json=_dataset_raw(
_id=dataset_id, numberOfSubjects=2, documentCount=12,
),
)

# Counts endpoint is slow → triggers stage-1 timeout.
async def _slow_counts(
request: httpx.Request, route: Any,
) -> httpx.Response:
await _asyncio.sleep(0.5)
return httpx.Response(200, json=_counts_raw(subject=99))

router.get(
f"/datasets/{dataset_id}/document-class-counts",
).mock(side_effect=_slow_counts)

# Stage-2 ndiqueries succeed (fast) and bulk-fetch returns the
# openminds species fixture. Demonstrates that the fallback
# gate kept stage 2 running and we got real per-class data
# despite the counts timeout.
router.post("/ndiquery").respond(
200, json=_ndiquery_body_for(["om-sp"]),
)
router.post(f"/datasets/{dataset_id}/documents/bulk-fetch").respond(
200, json=_bulk_fetch_body([HALEY_SPECIES]),
)

summary_svc = DatasetSummaryService(cloud, ontology_service)
summary = await summary_svc.build_summary(dataset_id, session=None)

# Counts came from the record fallback (subjects from
# numberOfSubjects, totalDocuments from documentCount):
assert summary.counts.subjects == 2
assert summary.counts.totalDocuments == 12
# Stage 2 RAN despite the counts timeout — species was extracted
# from the openminds_subject ndiquery+bulk-fetch path. This is
# the win this PR introduces.
assert summary.species is not None
assert len(summary.species) > 0
assert summary.species[0].ontologyId == "NCBITaxon:6239"
# Counts-timeout warning is still surfaced for operator awareness.
warnings_text = "\n".join(summary.extractionWarnings)
assert "class counts query failed" in warnings_text


def test_safe_record_int_handles_all_input_shapes() -> None:
"""Pin the helper that powers stage-2 record-fallback gating: it
must accept any input shape and return a non-negative int (0 on
anything other than a positive integer field value).
"""
from backend.services.dataset_summary_service import _safe_record_int

# Happy path:
assert _safe_record_int({"numberOfSubjects": 42}, "numberOfSubjects") == 42
# Field present but null (Sophie Griswold's record):
assert _safe_record_int({"numberOfSubjects": None}, "numberOfSubjects") == 0
# Field missing entirely:
assert _safe_record_int({}, "numberOfSubjects") == 0
# Wrong type (string):
assert _safe_record_int({"numberOfSubjects": "1656"}, "numberOfSubjects") == 0
# Negative value (defensive):
assert _safe_record_int({"numberOfSubjects": -1}, "numberOfSubjects") == 0
# Non-dict input (e.g. dataset_raw was an exception, not a dict):
assert _safe_record_int(None, "numberOfSubjects") == 0
assert _safe_record_int("garbage", "numberOfSubjects") == 0


@pytest.mark.asyncio
async def test_stage_1_dataset_timeout_does_not_block_summary(
cloud: NdiCloudClient,
Expand Down
Loading