Skip to content

Commit bba4b07

Browse files
committed
Add ndi.util.datasetSummary and compareDatasetSummary utilities
Extract dataset summary logic from symmetry tests into public ndi.util functions (mirroring MATLAB's ndi.util.datasetSummary and ndi.util.compareDatasetSummary). Simplify symmetry tests to use the new utilities instead of inline summary building and comparison. Add 14 unit tests covering both functions. https://claude.ai/code/session_01EctVW1VcbY2LzAdfZrGBB5
1 parent 44cd884 commit bba4b07

9 files changed

Lines changed: 581 additions & 175 deletions

File tree

src/ndi/util/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
"""
1515

1616
from .classname import ndi_matlab_classname, ndi_python_classname
17+
from .compare_dataset_summary import compareDatasetSummary
1718
from .compare_session_summary import compareSessionSummary
19+
from .dataset_summary import datasetSummary
1820
from .datestamp2datetime import datestamp2datetime
1921
from .downsampleTimeseries import downsampleTimeseries
2022
from .getHexDiffFromFileObj import getHexDiffFromFileObj
@@ -28,7 +30,9 @@
2830
__all__ = [
2931
"ndi_matlab_classname",
3032
"ndi_python_classname",
33+
"compareDatasetSummary",
3134
"compareSessionSummary",
35+
"datasetSummary",
3236
"datestamp2datetime",
3337
"downsampleTimeseries",
3438
"getHexDiffFromFileObj",
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""Compare two dataset summaries and return a report of differences.
2+
3+
MATLAB equivalent: ``ndi.util.compareDatasetSummary``
4+
5+
Compares two summary dicts (as produced by :func:`datasetSummary`) and
6+
returns a list of human-readable difference strings.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from typing import Any
12+
13+
from .compare_session_summary import compareSessionSummary
14+
15+
16+
def compareDatasetSummary(
    summary1: dict[str, Any],
    summary2: dict[str, Any],
    *,
    excludeFiles: list[str] | None = None,
    excludeFields: list[str] | None = None,
) -> list[str]:
    """Compare two dataset summaries and return a report.

    MATLAB equivalent: ``ndi.util.compareDatasetSummary(s1, s2, ...)``

    The comparison covers four fields, each of which can be skipped by
    naming it in ``excludeFields``:

    1. ``numSessions`` -- compared by equality (missing counts as ``0``).
    2. ``references`` -- compared order-insensitively.
    3. ``sessionIds`` -- compared order-insensitively.
    4. ``sessionSummaries`` -- compared pairwise via
       :func:`compareSessionSummary`. Pairs are matched by session ID when
       both summaries carry one ID per session summary, otherwise by
       position. If the two summaries hold a different number of session
       summaries only the count mismatch is reported.

    List-valued fields are normalized before comparison so that summaries
    decoded from JSON are handled gracefully: a missing or ``None`` field
    is treated as empty, and a bare string or a single object (``dict``)
    is treated as a one-element list.

    Args:
        summary1: First dataset summary dict.
        summary2: Second dataset summary dict.
        excludeFiles: Filenames to ignore when comparing file lists
            within session summaries (forwarded to
            :func:`compareSessionSummary`).
        excludeFields: Field names to skip entirely during comparison.
            The same list is forwarded to :func:`compareSessionSummary`.

    Returns:
        List of difference strings. Empty list means summaries match.
    """
    excludeFiles = [] if excludeFiles is None else excludeFiles
    excludeFields = [] if excludeFields is None else excludeFields

    report: list[str] = []

    # 1. Compare numSessions
    if "numSessions" not in excludeFields:
        n1 = summary1.get("numSessions", 0)
        n2 = summary2.get("numSessions", 0)
        if n1 != n2:
            report.append(f"numSessions differs: {n1} vs {n2}")

    # 2./3. Compare references and sessionIds, ignoring order
    for field in ("references", "sessionIds"):
        if field in excludeFields:
            continue
        values1 = sorted(_as_list(summary1.get(field)))
        values2 = sorted(_as_list(summary2.get(field)))
        if values1 != values2:
            report.append(f"{field} differ: {values1} vs {values2}")

    # 4. Compare sessionSummaries
    if "sessionSummaries" in excludeFields:
        return report

    ss1 = _as_list(summary1.get("sessionSummaries"))
    ss2 = _as_list(summary2.get("sessionSummaries"))

    if len(ss1) != len(ss2):
        # Pairing is meaningless when the counts disagree; report and stop.
        report.append(f"sessionSummaries count differs: {len(ss1)} vs {len(ss2)}")
        return report

    # sessionIds are used for pairing even when "sessionIds" itself is
    # excluded from the field-level comparison above.
    ids1 = _as_list(summary1.get("sessionIds"))
    ids2 = _as_list(summary2.get("sessionIds"))

    match_by_id = len(ids1) == len(ss1) and len(ids2) == len(ss2)
    if match_by_id:
        # Pair each session of summary1 with the same-ID session of summary2
        # (None when summary2 has no such ID).
        lookup2 = dict(zip(ids2, ss2))
        pairs = [(sid, s1, lookup2.get(sid)) for sid, s1 in zip(ids1, ss1)]
    else:
        # Fallback: IDs cannot be aligned with the summaries, pair by index.
        pairs = [(i, s1, s2) for i, (s1, s2) in enumerate(zip(ss1, ss2))]

    for label, s1, s2 in pairs:
        if match_by_id and s2 is None:
            report.append(f"sessionSummaries: session {label} not found in summary2")
            continue
        sub = compareSessionSummary(
            s1,
            s2,
            excludeFiles=excludeFiles,
            excludeFields=excludeFields,
        )
        report.extend(f"sessionSummaries[{label}]: {s}" for s in sub)

    return report


def _as_list(value: Any) -> list[Any]:
    """Normalize a summary field to a list (None -> [], scalar -> [scalar])."""
    if value is None:
        return []
    # A bare string or single object is one item, not a sequence of
    # characters / keys.
    if isinstance(value, (str, dict)):
        return [value]
    return list(value)

src/ndi/util/dataset_summary.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""ndi_dataset summary utility for symmetry testing.
2+
3+
MATLAB equivalent: ``ndi.util.datasetSummary``
4+
5+
Creates a summary dict of an ``ndi.dataset.Dataset`` object containing key
6+
fields and properties, intended for symmetry testing between NDI language
7+
implementations.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
from typing import Any
13+
14+
from .session_summary import sessionSummary
15+
16+
17+
def datasetSummary(dataset_obj: Any) -> dict[str, Any]:
    """Summarize an NDI dataset as a plain dict for symmetry testing.

    MATLAB equivalent: ``ndi.util.datasetSummary(dataset_obj)``

    Args:
        dataset_obj: Object providing ``session_list()`` and
            ``open_session(session_id)`` (presumably an
            ``ndi.dataset.Dataset`` -- the type is not enforced here).

    Returns:
        Dict with the keys, in this order:

        * ``numSessions`` -- number of session references.
        * ``references`` -- first item returned by ``session_list()``.
        * ``sessionIds`` -- second item returned by ``session_list()``.
        * ``sessionSummaries`` -- one :func:`sessionSummary` result per
          session ID, in ``sessionIds`` order.
    """
    # session_list() may return more than two items; only the first two
    # (references, session IDs) are used and the rest are discarded.
    references, ids, *_ = dataset_obj.session_list()

    # Open every session in turn and summarize it.
    per_session = [
        sessionSummary(dataset_obj.open_session(session_id)) for session_id in ids
    ]

    summary: dict[str, Any] = {}
    summary["numSessions"] = len(references)
    summary["references"] = references
    summary["sessionIds"] = ids
    summary["sessionSummaries"] = per_session
    return summary

src/ndi/util/ndi_matlab_python_bridge.yaml

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,89 @@ functions:
174174
Exact match. Replaces NDI sentinel strings for NaN, Infinity,
175175
and -Infinity in JSON text with Python-compatible representations.
176176
177+
- name: sessionSummary
178+
type: function
179+
matlab_path: "+ndi/+util/sessionSummary.m"
180+
python_path: "ndi/util/session_summary.py"
181+
input_arguments:
182+
- name: session_obj
183+
type_matlab: "ndi.session"
184+
type_python: "Any"
185+
output_arguments:
186+
- name: summary
187+
type_python: "dict[str, Any]"
188+
decision_log: >
189+
Exact match. Creates a summary dict of an ndi.session object
190+
for symmetry testing.
191+
192+
- name: compareSessionSummary
193+
type: function
194+
matlab_path: "+ndi/+util/compareSessionSummary.m"
195+
python_path: "ndi/util/compare_session_summary.py"
196+
input_arguments:
197+
- name: summary1
198+
type_matlab: "struct"
199+
type_python: "dict[str, Any]"
200+
- name: summary2
201+
type_matlab: "struct"
202+
type_python: "dict[str, Any]"
203+
- name: excludeFiles
204+
type_matlab: "cell array of char"
205+
type_python: "list[str] | None"
206+
default: "None"
207+
- name: excludeFields
208+
type_matlab: "cell array of char"
209+
type_python: "list[str] | None"
210+
default: "None"
211+
output_arguments:
212+
- name: report
213+
type_python: "list[str]"
214+
decision_log: >
215+
Exact match. Compares two session summaries and returns a list
216+
of difference strings. Empty list means summaries match.
217+
218+
- name: datasetSummary
219+
type: function
220+
matlab_path: "+ndi/+util/datasetSummary.m"
221+
python_path: "ndi/util/dataset_summary.py"
222+
input_arguments:
223+
- name: dataset_obj
224+
type_matlab: "ndi.dataset.Dataset"
225+
type_python: "Any"
226+
output_arguments:
227+
- name: summary
228+
type_python: "dict[str, Any]"
229+
decision_log: >
230+
Exact match. Creates a summary dict of an ndi.dataset.Dataset
231+
object for symmetry testing.
232+
233+
- name: compareDatasetSummary
234+
type: function
235+
matlab_path: "+ndi/+util/compareDatasetSummary.m"
236+
python_path: "ndi/util/compare_dataset_summary.py"
237+
input_arguments:
238+
- name: summary1
239+
type_matlab: "struct"
240+
type_python: "dict[str, Any]"
241+
- name: summary2
242+
type_matlab: "struct"
243+
type_python: "dict[str, Any]"
244+
- name: excludeFiles
245+
type_matlab: "cell array of char"
246+
type_python: "list[str] | None"
247+
default: "None"
248+
- name: excludeFields
249+
type_matlab: "cell array of char"
250+
type_python: "list[str] | None"
251+
default: "None"
252+
output_arguments:
253+
- name: report
254+
type_python: "list[str]"
255+
decision_log: >
256+
Exact match. Compares two dataset summaries and returns a list
257+
of difference strings. Delegates session-level comparison to
258+
compareSessionSummary.
259+
177260
- name: unwrapTableCellContent
178261
type: function
179262
matlab_path: "+ndi/+util/unwrapTableCellContent.m"

tests/symmetry/make_artifacts/dataset/test_build_dataset.py

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from ndi.document import Document
2525
from ndi.query import Query
2626
from ndi.session.dir import DirSession
27-
from ndi.util import sessionSummary
27+
from ndi.util import datasetSummary
2828
from tests.symmetry.conftest import PYTHON_ARTIFACTS
2929

3030
ARTIFACT_DIR = PYTHON_ARTIFACTS / "dataset" / "buildDataset" / "testBuildDatasetArtifacts"
@@ -46,29 +46,6 @@ def _add_doc_with_file(session: DirSession, doc_number: int) -> None:
4646
session.database_add(doc)
4747

4848

49-
def _dataset_summary(dataset: Dataset) -> dict:
50-
"""Create a summary structure for a dataset.
51-
52-
Mirrors MATLAB's ``ndi.symmetry.makeArtifacts.dataset.buildDataset``
53-
which writes: numSessions, references, sessionIds, sessionSummaries.
54-
"""
55-
refs, session_ids, *_ = dataset.session_list()
56-
num_sessions = len(refs)
57-
58-
# Build a session summary for each session in the dataset
59-
session_summaries = []
60-
for sid in session_ids:
61-
sess = dataset.open_session(sid)
62-
session_summaries.append(sessionSummary(sess))
63-
64-
return {
65-
"numSessions": num_sessions,
66-
"references": refs,
67-
"sessionIds": session_ids,
68-
"sessionSummaries": session_summaries,
69-
}
70-
71-
7249
class TestBuildDataset:
7350
"""Mirror of ndi.symmetry.makeArtifacts.dataset.buildDataset."""
7451

@@ -132,7 +109,7 @@ def test_build_dataset_artifacts(self):
132109

133110
# Write datasetSummary.json – open from artifact_dir so the session
134111
# path lists files that are actually present (including jsonDocuments).
135-
summary = _dataset_summary(artifact_dataset)
112+
summary = datasetSummary(artifact_dataset)
136113
summary_json = json.dumps(summary, indent=2, allow_nan=True)
137114
summary_path = artifact_dir / "datasetSummary.json"
138115
summary_path.write_text(summary_json, encoding="utf-8")

tests/symmetry/make_artifacts/dataset/test_download_ingested.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from ndi.dataset import Dataset
2626
from ndi.query import Query
27-
from ndi.util import sessionSummary
27+
from ndi.util import datasetSummary
2828
from tests.symmetry.conftest import PYTHON_ARTIFACTS
2929

3030
ARTIFACT_DIR = PYTHON_ARTIFACTS / "dataset" / "downloadIngested" / "testDownloadIngestedArtifacts"
@@ -99,31 +99,17 @@ def test_download_ingested_artifacts(self):
9999
# Open the dataset
100100
dataset = Dataset(dataset_path)
101101

102-
# Get session list
103-
ref_list, id_list, *_ = dataset.session_list()
104-
num_sessions = len(ref_list)
102+
# Build the dataset summary using the public utility
103+
dataset_summary = datasetSummary(dataset)
105104

106-
# Build session summaries for each session
107-
session_summaries = []
108-
for sid in id_list:
109-
sess = dataset.open_session(sid)
110-
session_summaries.append(sessionSummary(sess))
111-
112-
# Record document counts per session
105+
# Record document counts per session (extra field for this test)
106+
_ref_list, id_list, *_ = dataset.session_list()
113107
document_counts = []
114108
for sid in id_list:
115109
sess = dataset.open_session(sid)
116110
docs = sess.database_search(Query("base.id").match("(.*)"))
117111
document_counts.append({"sessionId": sid, "count": len(docs)})
118-
119-
# Build the dataset summary
120-
dataset_summary = {
121-
"numSessions": num_sessions,
122-
"references": ref_list,
123-
"sessionIds": id_list,
124-
"sessionSummaries": session_summaries,
125-
"documentCounts": document_counts,
126-
}
112+
dataset_summary["documentCounts"] = document_counts
127113

128114
# Write datasetSummary.json
129115
summary_json = json.dumps(dataset_summary, indent=2, allow_nan=True)

0 commit comments

Comments
 (0)