From ae1e6934899832a5300215cb2daf91406bac9a4c Mon Sep 17 00:00:00 2001
From: Dorota Jarecka <djarecka@gmail.com>
Date: Wed, 13 May 2026 15:00:03 -0400
Subject: [PATCH 1/4] Add `common_paths` to input dataset config for
 non-subject files

Replaces the hard-coded `dataset_description.json` in sparse-checkout,
`datalad get`, and `datalad run -i` with a configurable `common_paths`
list on each input dataset entry. Defaults to `["dataset_description.json"]`
to preserve existing behaviour. An empty list disables all common-path
inclusion.

Closes #374

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 babs/input_dataset.py                    | 9 +++++++++
 babs/templates/participant_job.sh.jinja2 | 9 ++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/babs/input_dataset.py b/babs/input_dataset.py
index 45ec2ed6..ec138032 100644
--- a/babs/input_dataset.py
+++ b/babs/input_dataset.py
@@ -24,6 +24,7 @@ def __init__(
         is_zipped,
         unzipped_path_containing_subject_dirs=None,
         required_files=None,
+        common_paths=None,
         processing_level=None,
         babs_project_analysis_path=None,
     ):
@@ -43,6 +44,11 @@ def __init__(
             when unzipped, this string precedes the subject directories
         required_files: list of str or None
             list of required files in the input dataset
+        common_paths: list of str or None
+            paths relative to the dataset root to include in the sparse-checkout for every job,
+            in addition to the per-subject (and per-session) path.
+            Defaults to ``["dataset_description.json"]`` when ``None``.
+            Pass an empty list to disable all common-path inclusion.
         processing_level: {'subject', 'session'} or None
             whether processing is done on a subject-wise or session-wise basis
         babs_project_analysis_path: str or None
@@ -57,6 +63,7 @@ def __init__(
         else:
             self.is_zipped = bool(is_zipped)
         self.required_files = required_files
+        self.common_paths = ['dataset_description.json'] if common_paths is None else common_paths
         if processing_level not in ['subject', 'session']:
             raise ValueError('invalid `processing_level`!')
         self.processing_level = processing_level
@@ -269,6 +276,7 @@ def as_dict(self):
             'is_zipped': self.is_zipped,
             'unzipped_path_containing_subject_dirs': unzipped_path,
             'required_files': self.required_files,
+            'common_paths': self.common_paths,
             'processing_level': self.processing_level,
             'babs_project_analysis_path': self.babs_project_analysis_path,
         }
@@ -433,4 +441,5 @@ def __init__(self, input_dataset):
             input_dataset.unzipped_path_containing_subject_dirs
         )
         self.required_files = input_dataset.required_files
+        self.common_paths = input_dataset.common_paths
         self.processing_level = input_dataset.processing_level
diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2
index f3151cea..5a3683fa 100644
--- a/babs/templates/participant_job.sh.jinja2
+++ b/babs/templates/participant_job.sh.jinja2
@@ -87,7 +87,9 @@ echo "# Pull down the input session but don't retrieve data contents:"
 {% if not input_dataset['is_zipped'] %}
 datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"
 
-datalad get -n "{{ input_dataset['path_in_babs'] }}/dataset_description.json"
+{% for common_path in input_dataset['common_paths'] %}
+datalad get -n "{{ input_dataset['path_in_babs'] }}/{{ common_path }}"
+{% endfor %}
 {% else %}
 datalad get -n "{{ input_dataset['path_in_babs'] }}"
 {% endif %}
@@ -99,7 +101,7 @@ datalad get -n "{{ input_dataset['path_in_babs'] }}"
 if [ -d "{{ input_dataset['path_in_babs'] }}/.git" ]; then
   ( cd "{{ input_dataset['path_in_babs'] }}" && \
     ( git sparse-checkout init --no-cone 2>/dev/null && \
-      { echo "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"; echo 'dataset_description.json'; } | git sparse-checkout set --stdin 2>/dev/null ) ) || true
+      { echo "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"; {% for common_path in input_dataset['common_paths'] %}echo '{{ common_path }}'; {% endfor %}} | git sparse-checkout set --stdin 2>/dev/null ) ) || true
 fi
 {% endif %}
 {% endfor %}
@@ -137,7 +139,8 @@ datalad run \
 {% for input_dataset in input_datasets %}
 {% if not input_dataset['is_zipped'] %}
 	-i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \
-	-i "{{ input_dataset['path_in_babs'] }}/dataset_description.json" \
+{% for common_path in input_dataset['common_paths'] %}	-i "{{ input_dataset['path_in_babs'] }}/{{ common_path }}" \
+{% endfor %}
 {% else %}
 	-i "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}" \
 {% endif %}

From fb21ddfa7f17a90925b691f9013a92d924efde2d Mon Sep 17 00:00:00 2001
From: Dorota Jarecka <djarecka@gmail.com>
Date: Wed, 13 May 2026 16:00:21 -0400
Subject: [PATCH 2/4] Echo each common path before `datalad get -n` in job script

---
 babs/templates/participant_job.sh.jinja2 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2
index 5a3683fa..5f26e886 100644
--- a/babs/templates/participant_job.sh.jinja2
+++ b/babs/templates/participant_job.sh.jinja2
@@ -88,6 +88,7 @@ echo "# Pull down the input session but don't retrieve data contents:"
 datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"
 
 {% for common_path in input_dataset['common_paths'] %}
+echo "# Getting common path: {{ input_dataset['path_in_babs'] }}/{{ common_path }}"
 datalad get -n "{{ input_dataset['path_in_babs'] }}/{{ common_path }}"
 {% endfor %}
 {% else %}

From e3dafc819cbb73e337139e6955dd717d0292c1d4 Mon Sep 17 00:00:00 2001
From: Dorota Jarecka <djarecka@gmail.com>
Date: Wed, 13 May 2026 16:18:41 -0400
Subject: [PATCH 3/4] docs: document common_paths field in
 preparation_config_yaml_file.rst

Add common_paths to the section overview list and optional sections list,
add a stub required_files section, and add a full common_paths section
with examples and usage notes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/preparation_config_yaml_file.rst | 64 ++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/docs/preparation_config_yaml_file.rst b/docs/preparation_config_yaml_file.rst
index 767a95ac..e2593aea 100644
--- a/docs/preparation_config_yaml_file.rst
+++ b/docs/preparation_config_yaml_file.rst
@@ -29,6 +29,7 @@ Sections in the configuration YAML file
 * **all_results_in_one_zip**: whether to zip all results in one zip file;
 * **zip_foldernames**: the results foldername(s) to be zipped;
 * **required_files**: to only keep subjects (sessions) that have this list of required files in input dataset(s);
+* **common_paths**: (nested under each entry of **input_datasets**) dataset-root paths to include in the sparse-checkout for every job, in addition to the per-subject (and per-session) path (e.g., a shared ``phenotype/participants.tsv`` file);
 * **alert_log_messages**: alert messages in the log files that may be helpful for debugging errors in failed jobs;
 
 Among these sections, these sections are optional:
@@ -40,6 +41,7 @@ Among these sections, these sections are optional:
   * You must include this section if there are more one input dataset.
 
 * **required_files**
+* **common_paths**
 * **alert_log_messages**
 * **imported_files**
 
@@ -103,7 +105,7 @@ Example section **input_datasets**
             unzipped_path_containing_subject_dirs: "freesurfer"
             path_in_babs: inputs/data/freesurfer
 
-This example shows two input datasets: 
+This example shows two input datasets:
 one is a raw BIDS dataset, and the other is a zipped FreeSurfer results from another BABS project.
 Previously, the commandline to use something like this would have required::
 
@@ -773,4 +775,64 @@ Notes:
 
 .. _required_files:
 
+Section ``required_files``
+==========================
+
+.. note::
+
+    ``required_files`` is currently not fully implemented.
+    The field is accepted in the YAML file but filtering is not yet applied.
+
+.. _common-paths:
+
+Section ``common_paths``
+=========================
+
+The ``common_paths`` field lists paths (relative to an input dataset's root)
+that every job should include in the sparse-checkout and retrieve with
+``datalad get``, in addition to the per-subject (and per-session) path.
+This is useful when BIDS Apps or processing scripts need dataset-level files
+that live outside any individual subject directory.
+
+By default (when the field is omitted), BABS automatically includes
+``dataset_description.json`` for every non-zipped input dataset.
+Once you supply ``common_paths`` explicitly, the default is **replaced** —
+so if you still want ``dataset_description.json`` you must list it yourself.
+
+``common_paths`` is optional.  It is nested under the relevant input dataset
+entry inside the ``input_datasets`` section.
+
+Example — keep the default ``dataset_description.json`` **and** add a shared
+phenotype file:
+
+..  code-block:: yaml
+
+    input_datasets:
+        BIDS:
+            is_zipped: false
+            origin_url: "/path/to/BIDS"
+            path_in_babs: inputs/data/BIDS
+            common_paths:
+                - "phenotype/participants.tsv"
+                - "dataset_description.json"
+
+Example — disable all common-path retrieval (pass an empty list):
+
+..  code-block:: yaml
+
+    input_datasets:
+        BIDS:
+            is_zipped: false
+            origin_url: "/path/to/BIDS"
+            path_in_babs: inputs/data/BIDS
+            common_paths: []
+
+Notes:
+
+* Paths are relative to the input dataset root (e.g., ``"phenotype/participants.tsv"``
+  not ``"inputs/data/BIDS/phenotype/participants.tsv"``).
+* Each path is passed individually to ``datalad get -n`` (which does not download file
+  contents) and echoed in the job log; contents are fetched as ``datalad run -i`` inputs.
+* This field has no effect on zipped input datasets.
+
 

From 443dcf66a0d2a0b79ee8a99fddceced9186639ac Mon Sep 17 00:00:00 2001
From: Austin Macdonald <austin@dartmouth.edu>
Date: Thu, 21 May 2026 08:59:56 -0500
Subject: [PATCH 4/4] Fixup tests

Add common paths to the submit script test inputs.

No need to add them to the zipped inputs: the template ignores `common_paths` for zipped datasets.
---
 tests/test_generate_submit_script.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_generate_submit_script.py b/tests/test_generate_submit_script.py
index 8f4e832b..7ee9f610 100644
--- a/tests/test_generate_submit_script.py
+++ b/tests/test_generate_submit_script.py
@@ -14,6 +14,7 @@
         'path_in_babs': 'inputs/data/BIDS',
         'unzipped_path_containing_subject_dirs': 'inputs/data/BIDS',
         'is_zipped': False,
+        'common_paths': ['dataset_description.json'],
     },
 ]
 
@@ -29,6 +30,7 @@
         'path_in_babs': 'inputs/data/BIDS',
         'unzipped_path_containing_subject_dirs': 'inputs/data/BIDS',
         'is_zipped': False,
+        'common_paths': [],
     },
 ]