From 6c833886ff873eb1490a249fe9cdfe123361479d Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 8 Jun 2026 19:33:47 -0500 Subject: [PATCH 01/16] templates: add pre_app/post_run hook splice points to participant_job Add two optional, ordered hook lists to the participant job script: - pre_app: spliced before the `datalad run` wrapper - post_run: spliced after the run, before the push to output storage Each splice region is a subshell that exports the contract vars (subid, BRANCH, PROJECT_ROOT, JOB_SCRATCH_DIR; plus sesid at session level) so separate-process hook scripts read them by name, while the export cannot leak into the datalad run / container. processing_level is intentionally not exported: it is a render-time constant, and sesid presence already distinguishes subject vs session. Hooks reach the template as list[str]; resolving the YAML hooks: block into those strings is a later step. With no hooks configured the render is byte-identical to before (jinja {% if %} guard + trim/lstrip). Thread hook_pre_app/hook_post_run (default None) through generate_submit_script(); both existing call sites are unaffected. Add render tests: no-hooks (None == [], no markers, shellcheck), pre_app+post_run expansion (contract export, snippet order, position around the run wrapper, shellcheck), and session-only sesid export. Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/generate_submit_script.py | 10 +++ babs/templates/participant_job.sh.jinja2 | 24 +++++++ tests/test_generate_submit_script.py | 86 ++++++++++++++++++++++++ 3 files changed, 120 insertions(+) diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index 8242a4e2..1f4783e9 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -30,6 +30,8 @@ def generate_submit_script( container_images=None, datalad_run_message=None, project_root=None, + hook_pre_app=None, + hook_post_run=None, ): """ Generate a bash script that runs the BIDS App singularity image. @@ -62,6 +64,12 @@ def generate_submit_script( Absolute path to the BABS project root (parent of `analysis/`). Passed to the template; used in the error message when PROJECT_ROOT is unset. If None, the placeholder ``{project_root}`` is shown. + hook_pre_app : list of str, optional + Shell snippets spliced into a subshell just before the ``datalad run`` + wrapper. None (or empty) renders nothing. Snippets are emitted in order. + hook_post_run : list of str, optional + Shell snippets spliced into a subshell just after the ``datalad run`` + wrapper, before the push. None (or empty) renders nothing. Returns ------- @@ -129,6 +137,8 @@ def generate_submit_script( container_image_paths=container_image_paths, datalad_run_message=datalad_run_message, project_root=project_root, + hook_pre_app=hook_pre_app, + hook_post_run=hook_post_run, ) diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index f3151cea..eac3ee66 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -130,6 +130,18 @@ for CONTAINER_JOB in "${CONTAINER_IMAGE_PATHS[@]}"; do exit 1 fi done +{% if hook_pre_app %} +# pre_app hooks: spliced before the datalad run wrapper; subshell exports the contract. +( + export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR +{% if processing_level == 'session' %} + export sesid +{% endif %} +{% for snippet in hook_pre_app %} +{{ snippet }} +{% endfor %} +) +{% endif %} # datalad run: datalad run \ @@ -156,6 +168,18 @@ datalad run \ {% endif %} -m "{{ (datalad_run_message if datalad_run_message is defined and datalad_run_message else container_name) }} {% raw %}${subid}{% endraw %}{% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}" \ "bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}" +{% if hook_post_run %} +# post_run hooks: spliced after the run, before push; subshell exports the contract. +( + export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR +{% if processing_level == 'session' %} + export sesid +{% endif %} +{% for snippet in hook_post_run %} +{{ snippet }} +{% endfor %} +) +{% endif %} # Finish up: # push result file content to output RIA storage: diff --git a/tests/test_generate_submit_script.py b/tests/test_generate_submit_script.py index 8f4e832b..ebf07087 100644 --- a/tests/test_generate_submit_script.py +++ b/tests/test_generate_submit_script.py @@ -116,6 +116,92 @@ def test_generate_submit_script(input_datasets, config_file, processing_level, t assert passed, status +# --------------------------------------------------------------------------- +# pre_app / post_run splice-point hooks +# --------------------------------------------------------------------------- + +PRE_APP_MARKER = '# pre_app hooks:' +POST_RUN_MARKER = '# post_run hooks:' +CONTRACT_EXPORT = 'export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR' + + +def _render_with_hooks(processing_level='subject', **hook_kwargs): + """Render participant_job.sh for a simple single-app config, with optional hooks.""" + config = read_yaml(NOTEBOOKS_DIR / 'eg_fmriprep-24-1-1_regular.yaml') + return generate_submit_script( + queue_system='slurm', + cluster_resources_config=config['cluster_resources'], + script_preamble=config['script_preamble'], + job_scratch_directory=config['job_compute_space'], + input_datasets=input_datasets_prep, + processing_level=processing_level, + container_name='fmriprep-24-1-1', + zip_foldernames=config['zip_foldernames'], + **hook_kwargs, + ) + + +def _pre_app_block(text): + """Slice out the rendered pre_app subshell (marker through its closing paren).""" + start = text.index(PRE_APP_MARKER) + return text[start : text.index('\n)\n', start) + 2] + + +def test_hooks_absent_when_not_configured(tmp_path): + """No hooks configured: no splice blocks rendered, and None behaves like [].""" + default_render = _render_with_hooks() + empty_render = _render_with_hooks(hook_pre_app=[], hook_post_run=[]) + + assert default_render == empty_render # None and [] are both no-ops + assert PRE_APP_MARKER not in default_render + assert POST_RUN_MARKER not in default_render + + out_fn = tmp_path / 'participant_job_nohooks.sh' + out_fn.write_text(default_render) + passed, status = run_shellcheck(str(out_fn)) + assert passed, status + + +def test_pre_app_and_post_run_hooks_spliced(tmp_path): + """Configured hooks render as subshells exporting the contract, in order, + positioned around the datalad run wrapper.""" + text = _render_with_hooks( + hook_pre_app=['echo PRE_A', 'echo PRE_B'], + hook_post_run=['echo POST_ONE'], + ) + + assert PRE_APP_MARKER in text + assert POST_RUN_MARKER in text + # the contract is exported once per splice subshell + assert text.count(CONTRACT_EXPORT) == 2 + # snippets appear, in the configured order + assert text.index('echo PRE_A') < text.index('echo PRE_B') + assert 'echo POST_ONE' in text + + # pre_app sits before the run; post_run after it, before the push + i_pre = text.index(PRE_APP_MARKER) + i_run = text.index('\ndatalad run ') + i_post = text.index(POST_RUN_MARKER) + i_push = text.index('datalad push --to output-storage') + assert i_pre < i_run < i_post < i_push + + out_fn = tmp_path / 'participant_job_hooks.sh' + out_fn.write_text(text) + passed, status = run_shellcheck(str(out_fn)) + if not passed: + print(text) + assert passed, status + + +def test_hooks_export_sesid_only_at_session_level(): + """sesid joins the exported contract only for session-level processing.""" + session = _render_with_hooks(processing_level='session', hook_pre_app=['echo HI']) + subject = _render_with_hooks(processing_level='subject', hook_pre_app=['echo HI']) + + assert 'export sesid' in _pre_app_block(session) + assert 'export sesid' not in _pre_app_block(subject) + + def run_shellcheck(script_path): """Run shellcheck on a shell script string and return the result. From a5ec6a99373faa75f22ac6f5db7dd2af17910ecf Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 10:20:32 -0500 Subject: [PATCH 02/16] hooks: add resolve_hooks for `hooks:` config (forms a/b) First resolution step for the participant_job splice points (8b4185a): turn a `hooks:` config block into the command strings those splice points consume, plus the files babs materializes at init. This is what lets cross-cutting concerns -- optional zip, NORDIC, a BIDS-validator gate -- live as user-configurable hooks in one codepath instead of diverged templates. Design: PennLINC/babs#365. babs/hooks.py (pure, no I/O): resolve_hooks(hooks_config, *, source_base) classifies each entry into Verbatim (form a, raw snippet) or CopyIn (form b, `{script: }` copied to code/hooks/.sh). Render (templated built-ins / container hooks) is defined as the forward-compat seam but not yet produced. Fail-fast ValueError on bad config. CopyIn.as_import() emits the {original_path, analysis_path} shape _init_import_files already consumes, so a later slice wires materialization with no duplicated copy logic. tests/test_hooks.py: 21 unit tests (run in the slurm-docker-ci container; conftest pulls slurm deps at collection). Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/hooks.py | 206 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_hooks.py | 125 +++++++++++++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 babs/hooks.py create mode 100644 tests/test_hooks.py diff --git a/babs/hooks.py b/babs/hooks.py new file mode 100644 index 00000000..caa4c3f7 --- /dev/null +++ b/babs/hooks.py @@ -0,0 +1,206 @@ +"""Resolve a `hooks:` config block into spliceable commands + materializations. + +A BABS container config may carry a top-level ``hooks:`` block with +``pre_app:`` / ``post_run:`` lists. Each entry names a snippet of work to run +at the corresponding splice point in ``participant_job.sh`` (added in the +splice-points step). This module turns those config entries into: + +- the **runtime command strings** spliced into the job script, and +- the **file materializations** BABS must perform at ``babs init`` (copying a + user script into ``code/hooks/.sh``). + +`resolve_hooks` is **pure** (config in -> commands + descriptors out, no I/O); +the caller (`Bootstrap`) performs the materializations. + +Entry forms supported in this version: + +- **(a) raw snippet** -- a bare string, spliced verbatim (``Verbatim``). +- **(b) user script** -- ``{script: }``; copied into + ``code/hooks/.sh`` and invoked as ``bash ./code/hooks/.sh`` + (``CopyIn``). The destination name is the source basename; two sources sharing + a basename collide (no override yet -- add one if it proves needed). + +`Render` (rendering a shipped ``*.sh.jinja2`` through the shared singularity +partial) is defined here as the forward-compatible seam, but `resolve_hooks` +does not yet produce one -- entries requiring it are rejected. See the +pipeline-of-one design notes for the builtin/container forms it will carry. +""" + +import os.path as op +from dataclasses import dataclass, field + +# Where copied/rendered hook scripts land inside the analysis dataset. +HOOKS_SUBDIR = op.join('code', 'hooks') + +# The splice points a hooks config may target, in run order. +SPLICE_POINTS = ('pre_app', 'post_run') + + +@dataclass(frozen=True) +class Verbatim: + """Form (a): a raw shell snippet spliced inline. No file materialization.""" + + command: str + + +@dataclass(frozen=True) +class CopyIn: + """Form (b) / static built-in: copy ``original_path`` -> ``code/hooks/.sh``.""" + + original_path: str + name: str + + @property + def analysis_path(self): + """Destination relative to the analysis dataset root.""" + return op.join(HOOKS_SUBDIR, f'{self.name}.sh') + + def as_import(self): + """Shape consumed by ``Bootstrap._init_import_files``.""" + return {'original_path': self.original_path, 'analysis_path': self.analysis_path} + + +@dataclass(frozen=True) +class Render: + """Form (c) / templated built-in: render ``template_path`` -> ``code/hooks/.sh``. + + Defined as the forward-compatible seam; `resolve_hooks` does not yet produce + one (wired in a later step alongside the shared singularity partial). + """ + + template_path: str + name: str + context: dict = field(default_factory=dict) + + @property + def analysis_path(self): + """Destination relative to the analysis dataset root.""" + return op.join(HOOKS_SUBDIR, f'{self.name}.sh') + + +def _validate_name(name, entry): + """Reject a hook name that would escape ``code/hooks/`` or is empty.""" + if not name or name in ('.', '..') or op.basename(name) != name: + raise ValueError( + f'Invalid hook name {name!r} from entry {entry!r}: a hook name must be ' + 'a bare filename (no path separators, not "." or "..").' + ) + + +def _default_name(source): + """Derive a hook name from a source path: basename with a trailing ``.sh`` dropped.""" + base = op.basename(source) + return base[:-3] if base.endswith('.sh') else base + + +def _resolve_entry(entry, source_base): + """Classify a single config entry into a materialization mode. + + Parameters + ---------- + entry : str or dict + One item from a ``pre_app:`` / ``post_run:`` list. + source_base : str + Directory that relative ``script:`` paths resolve against (the + ``babs init`` invocation cwd). Absolute paths pass through unchanged. + + Returns + ------- + Verbatim or CopyIn + The classified mode. (`Render` is not produced in this version.) + """ + if isinstance(entry, str): + return Verbatim(command=entry) + + if isinstance(entry, dict) and 'script' in entry: + unknown = set(entry) - {'script'} + if unknown: + raise ValueError( + f'Unsupported key(s) {sorted(unknown)} in script hook entry {entry!r}; ' + 'expected only "script".' + ) + source = entry['script'] + # The destination name is derived from the source basename; there is no + # override yet, so two sources with the same basename collide (see below). + name = _default_name(source) + _validate_name(name, entry) + original_path = source if op.isabs(source) else op.join(source_base, source) + return CopyIn(original_path=original_path, name=name) + + raise ValueError( + f'Unsupported hook entry: {entry!r}. This version supports a raw shell ' + 'string or a {script: } mapping.' + ) + + +def _command_for(mode): + """The runtime command string a classified mode contributes to its splice list.""" + if isinstance(mode, Verbatim): + return mode.command + if isinstance(mode, (CopyIn, Render)): + return f'bash ./{mode.analysis_path}' + raise TypeError(f'Unknown hook mode: {mode!r}') + + +def resolve_hooks(hooks_config, *, source_base): + """Resolve a ``hooks:`` config block into spliceable commands + materializations. + + Parameters + ---------- + hooks_config : dict or None + The top-level ``hooks:`` block (``{'pre_app': [...], 'post_run': [...]}``), + or ``None`` when no hooks are configured. + source_base : str + Directory that relative ``script:`` paths resolve against (the + ``babs init`` invocation cwd). + + Returns + ------- + pre_app : list of str + Command strings to splice at the ``pre_app`` point, in order. + post_run : list of str + Command strings to splice at the ``post_run`` point, in order. + materializations : list of (CopyIn or Render) + Files BABS must copy/render into ``code/hooks/`` at ``babs init``. + `Verbatim` entries contribute no materialization. + + Raises + ------ + ValueError + On an unknown splice-point key, an unsupported entry form, an invalid + hook name, or a name collision across the (shared) ``code/hooks/`` + namespace. + """ + if hooks_config is None: + return [], [], [] + if not isinstance(hooks_config, dict): + raise ValueError( + f'`hooks:` must be a mapping with keys {list(SPLICE_POINTS)}, ' + f'got {hooks_config!r}.' + ) + unknown_points = set(hooks_config) - set(SPLICE_POINTS) + if unknown_points: + raise ValueError( + f'Unknown hook splice point(s) {sorted(unknown_points)}; ' + f'expected only {list(SPLICE_POINTS)}.' + ) + + resolved = {point: [] for point in SPLICE_POINTS} + materializations = [] + seen_names = {} # name -> splice point that claimed it (for collision messages) + + for point in SPLICE_POINTS: + for entry in hooks_config.get(point) or []: + mode = _resolve_entry(entry, source_base=source_base) + resolved[point].append(_command_for(mode)) + if isinstance(mode, (CopyIn, Render)): + if mode.name in seen_names: + raise ValueError( + f'Duplicate hook name {mode.name!r} (in {point!r} and ' + f'{seen_names[mode.name]!r}); both would write ' + f'{op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' + ) + seen_names[mode.name] = point + materializations.append(mode) + + return resolved['pre_app'], resolved['post_run'], materializations diff --git a/tests/test_hooks.py b/tests/test_hooks.py new file mode 100644 index 00000000..828418fc --- /dev/null +++ b/tests/test_hooks.py @@ -0,0 +1,125 @@ +"""Unit tests for `babs.hooks.resolve_hooks` (pure; no I/O, no docker).""" + +import pytest + +from babs.hooks import ( + CopyIn, + Render, + Verbatim, + resolve_hooks, +) + + +def test_none_config_resolves_empty(): + assert resolve_hooks(None, source_base='/base') == ([], [], []) + + +def test_empty_config_resolves_empty(): + assert resolve_hooks({}, source_base='/base') == ([], [], []) + + +def test_missing_or_empty_lists_resolve_empty(): + cfg = {'pre_app': None, 'post_run': []} + assert resolve_hooks(cfg, source_base='/base') == ([], [], []) + + +def test_raw_snippet_is_verbatim_with_no_materialization(): + cfg = {'pre_app': ['echo hello']} + pre_app, post_run, materializations = resolve_hooks(cfg, source_base='/base') + assert pre_app == ['echo hello'] + assert post_run == [] + assert materializations == [] + + +def test_script_relative_path_resolved_against_source_base(): + cfg = {'pre_app': [{'script': 'hooks/validate.sh'}]} + pre_app, _, materializations = resolve_hooks(cfg, source_base='/proj') + assert pre_app == ['bash ./code/hooks/validate.sh'] + assert materializations == [ + CopyIn(original_path='/proj/hooks/validate.sh', name='validate') + ] + assert materializations[0].as_import() == { + 'original_path': '/proj/hooks/validate.sh', + 'analysis_path': 'code/hooks/validate.sh', + } + + +def test_script_absolute_path_passes_through(): + cfg = {'post_run': [{'script': '/abs/path/zip.sh'}]} + _, post_run, materializations = resolve_hooks(cfg, source_base='/proj') + assert post_run == ['bash ./code/hooks/zip.sh'] + assert materializations[0].original_path == '/abs/path/zip.sh' + + +def test_default_name_only_strips_trailing_sh(): + # a source without a .sh suffix keeps its basename as the name + cfg = {'pre_app': [{'script': 'tools/runme'}]} + pre_app, _, _ = resolve_hooks(cfg, source_base='/proj') + assert pre_app == ['bash ./code/hooks/runme.sh'] + + +def test_order_preserved_within_and_across_splice_points(): + cfg = { + 'pre_app': ['echo a', 'echo b'], + 'post_run': ['echo c'], + } + pre_app, post_run, _ = resolve_hooks(cfg, source_base='/base') + assert pre_app == ['echo a', 'echo b'] + assert post_run == ['echo c'] + + +def test_name_collision_across_splice_points_raises(): + cfg = { + 'pre_app': [{'script': 'a/validate.sh'}], + 'post_run': [{'script': 'b/validate.sh'}], + } + with pytest.raises(ValueError, match='Duplicate hook name'): + resolve_hooks(cfg, source_base='/base') + + +def test_unknown_splice_point_raises(): + with pytest.raises(ValueError, match='Unknown hook splice point'): + resolve_hooks({'pre-app': ['echo x']}, source_base='/base') + + +def test_non_mapping_config_raises(): + with pytest.raises(ValueError, match='must be a mapping'): + resolve_hooks(['echo x'], source_base='/base') + + +@pytest.mark.parametrize('entry', [{'builtin': 'zip'}, {'container': 'nordic'}, 42]) +def test_unsupported_entry_forms_raise(entry): + with pytest.raises(ValueError, match='Unsupported hook entry'): + resolve_hooks({'pre_app': [entry]}, source_base='/base') + + +@pytest.mark.parametrize('extra_key', ['name', 'singularity_args']) +def test_unknown_key_in_script_entry_raises(extra_key): + # `name` is rejected too: there is no override in this version. + cfg = {'pre_app': [{'script': 'x.sh', extra_key: 'whatever'}]} + with pytest.raises(ValueError, match='Unsupported key'): + resolve_hooks(cfg, source_base='/base') + + +@pytest.mark.parametrize('bad_source', ['some/dir/', '..', '.']) +def test_source_with_invalid_derived_name_raises(bad_source): + # The destination name is the source basename; a source whose basename is + # empty or '.'/'..' has no usable name. + cfg = {'pre_app': [{'script': bad_source}]} + with pytest.raises(ValueError, match='Invalid hook name'): + resolve_hooks(cfg, source_base='/base') + + +def test_render_is_defined_but_never_produced(): + # The forward-compat seam exists as a type... + r = Render(template_path='t.sh.jinja2', name='zip', context={'k': 'v'}) + assert r.analysis_path == 'code/hooks/zip.sh' + # ...but resolve_hooks never returns one in this version (no config form maps to it). + _, _, materializations = resolve_hooks( + {'post_run': ['echo verbatim', {'script': 'x.sh'}]}, source_base='/base' + ) + assert all(isinstance(m, CopyIn) for m in materializations) + + +def test_verbatim_dataclass_shape(): + assert Verbatim(command='echo hi').command == 'echo hi' From 1d3952ddb878d98cdc770311478fc50bf85722a6 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 10:27:55 -0500 Subject: [PATCH 03/16] hooks: collide on conflicting content, not on name reuse A hook collision is about the materialized file, not the name alone. Key the check on descriptor equality: the *same* hook reused at multiple splice points (e.g. a BIDS validator at both pre_app and post_run) is copied once and referenced from each list, while two *different* descriptors claiming the same code/hooks/.sh still raise. Because Render carries `context`, this also catches one template rendered two ways into the same name (a real conflict) once Render is wired. tests: same-source-at-both-points materializes once; different-source-same- name still collides; Render equality distinguishes context (pins the invariant the rule depends on, since resolve_hooks doesn't emit Render yet). Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/hooks.py | 31 ++++++++++++++++++++++--------- tests/test_hooks.py | 25 ++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/babs/hooks.py b/babs/hooks.py index caa4c3f7..f558814e 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -17,8 +17,10 @@ - **(a) raw snippet** -- a bare string, spliced verbatim (``Verbatim``). - **(b) user script** -- ``{script: }``; copied into ``code/hooks/.sh`` and invoked as ``bash ./code/hooks/.sh`` - (``CopyIn``). The destination name is the source basename; two sources sharing - a basename collide (no override yet -- add one if it proves needed). + (``CopyIn``). The destination name is the source basename. The *same* script + may appear at multiple splice points (e.g. a validator at ``pre_app`` and + ``post_run``) -- it is copied once and referenced from each. Two *different* + sources sharing a basename collide (no name override yet -- add one if needed). `Render` (rendering a shipped ``*.sh.jinja2`` through the shared singularity partial) is defined here as the forward-compatible seam, but `resolve_hooks` @@ -168,8 +170,9 @@ def resolve_hooks(hooks_config, *, source_base): ------ ValueError On an unknown splice-point key, an unsupported entry form, an invalid - hook name, or a name collision across the (shared) ``code/hooks/`` - namespace. + hook name, or two *different* hooks resolving to the same name in the + (shared) ``code/hooks/`` namespace. The identical hook reused at + multiple splice points is allowed (materialized once). """ if hooks_config is None: return [], [], [] @@ -187,20 +190,30 @@ def resolve_hooks(hooks_config, *, source_base): resolved = {point: [] for point in SPLICE_POINTS} materializations = [] - seen_names = {} # name -> splice point that claimed it (for collision messages) + seen = {} # name -> (descriptor, splice point that first claimed it) for point in SPLICE_POINTS: for entry in hooks_config.get(point) or []: mode = _resolve_entry(entry, source_base=source_base) resolved[point].append(_command_for(mode)) if isinstance(mode, (CopyIn, Render)): - if mode.name in seen_names: + # Collision is about the materialized file, not the name alone: + # the *same* hook used at multiple splice points (identical + # descriptor) is fine -- copy/render once, reference it from + # each list. Only a *different* descriptor claiming the same + # name is a real conflict (one would clobber the other). For a + # Render this includes a differing `context` -- same template + # rendered two ways into one file collides. + if mode.name in seen: + prior, prior_point = seen[mode.name] + if mode == prior: + continue # same file; already materialized raise ValueError( - f'Duplicate hook name {mode.name!r} (in {point!r} and ' - f'{seen_names[mode.name]!r}); both would write ' + f'Duplicate hook name {mode.name!r} (in {prior_point!r} and ' + f'{point!r}) resolves to different content; both would write ' f'{op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' ) - seen_names[mode.name] = point + seen[mode.name] = (mode, point) materializations.append(mode) return resolved['pre_app'], resolved['post_run'], materializations diff --git a/tests/test_hooks.py b/tests/test_hooks.py index 828418fc..ee9a9b96 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -68,7 +68,7 @@ def test_order_preserved_within_and_across_splice_points(): assert post_run == ['echo c'] -def test_name_collision_across_splice_points_raises(): +def test_different_sources_same_name_collide(): cfg = { 'pre_app': [{'script': 'a/validate.sh'}], 'post_run': [{'script': 'b/validate.sh'}], @@ -77,6 +77,29 @@ def test_name_collision_across_splice_points_raises(): resolve_hooks(cfg, source_base='/base') +def test_same_script_at_both_points_materializes_once(): + # The identical hook reused at pre_app and post_run (e.g. a validator) is + # copied once and referenced from each list -- not a collision. + cfg = { + 'pre_app': [{'script': 'hooks/validate.sh'}], + 'post_run': [{'script': 'hooks/validate.sh'}], + } + pre_app, post_run, materializations = resolve_hooks(cfg, source_base='/proj') + assert pre_app == ['bash ./code/hooks/validate.sh'] + assert post_run == ['bash ./code/hooks/validate.sh'] + assert materializations == [ + CopyIn(original_path='/proj/hooks/validate.sh', name='validate') + ] + + +def test_render_equality_distinguishes_context(): + # The collision rule keys on descriptor equality; Render carries `context`, + # so the same template rendered two ways into one name is a real conflict. + # (Render isn't produced by resolve_hooks yet -- this pins the invariant.) + assert Render('zip.sh.jinja2', 'zip', {'a': 1}) == Render('zip.sh.jinja2', 'zip', {'a': 1}) + assert Render('zip.sh.jinja2', 'zip', {'a': 1}) != Render('zip.sh.jinja2', 'zip', {'a': 2}) + + def test_unknown_splice_point_raises(): with pytest.raises(ValueError, match='Unknown hook splice point'): resolve_hooks({'pre-app': ['echo x']}, source_base='/base') From d896aacb89060dd7e475678a7af12daeae2a023e Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 11:37:56 -0500 Subject: [PATCH 04/16] hooks: script: is an absolute path used verbatim (drop source_base) Follow babs's existing convention for copied-in local files instead of inventing a parallel one. `imported_files.original_path` -- which form (b) reuses via _init_import_files -- is an absolute local path used as-is (op.exists + copy, no cwd-join, no abspath validation); mechababs likewise resolves the inclusion file to absolute before passing it ("so datalad run cp doesn't need the cwd to match"). So resolve_hooks no longer takes source_base and no longer joins: CopyIn.original_path is the `script:` value verbatim, and _init_import_files raises FileNotFoundError on a bad path exactly as it does for imported_files today. This also lands on the right side of the path-handling review on PennLINC/babs#369: source paths are absolute (the explicitly-supported copy-in case), while the destination (code/hooks/.sh) stays relative and is containment-validated by _validate_name. Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/hooks.py | 28 +++++++++++----------- tests/test_hooks.py | 57 +++++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 46 deletions(-) diff --git a/babs/hooks.py b/babs/hooks.py index f558814e..fdf6003c 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -17,10 +17,12 @@ - **(a) raw snippet** -- a bare string, spliced verbatim (``Verbatim``). - **(b) user script** -- ``{script: }``; copied into ``code/hooks/.sh`` and invoked as ``bash ./code/hooks/.sh`` - (``CopyIn``). The destination name is the source basename. The *same* script - may appear at multiple splice points (e.g. a validator at ``pre_app`` and - ``post_run``) -- it is copied once and referenced from each. Two *different* - sources sharing a basename collide (no name override yet -- add one if needed). + (``CopyIn``). ```` is an absolute local path used verbatim -- the same + convention as ``imported_files.original_path``, which this reuses. The + destination name is the source basename. The *same* script may appear at + multiple splice points (e.g. a validator at ``pre_app`` and ``post_run``) -- + it is copied once and referenced from each. Two *different* sources sharing a + basename collide (no name override yet -- add one if needed). `Render` (rendering a shipped ``*.sh.jinja2`` through the shared singularity partial) is defined here as the forward-compatible seam, but `resolve_hooks` @@ -95,16 +97,13 @@ def _default_name(source): return base[:-3] if base.endswith('.sh') else base -def _resolve_entry(entry, source_base): +def _resolve_entry(entry): """Classify a single config entry into a materialization mode. Parameters ---------- entry : str or dict One item from a ``pre_app:`` / ``post_run:`` list. - source_base : str - Directory that relative ``script:`` paths resolve against (the - ``babs init`` invocation cwd). Absolute paths pass through unchanged. Returns ------- @@ -126,8 +125,10 @@ def _resolve_entry(entry, source_base): # override yet, so two sources with the same basename collide (see below). name = _default_name(source) _validate_name(name, entry) - original_path = source if op.isabs(source) else op.join(source_base, source) - return CopyIn(original_path=original_path, name=name) + # `source` is used verbatim as the copy source -- same convention as + # `imported_files.original_path` (an absolute local path; resolved by + # `_init_import_files`, which raises FileNotFoundError on a bad path). + return CopyIn(original_path=source, name=name) raise ValueError( f'Unsupported hook entry: {entry!r}. This version supports a raw shell ' @@ -144,7 +145,7 @@ def _command_for(mode): raise TypeError(f'Unknown hook mode: {mode!r}') -def resolve_hooks(hooks_config, *, source_base): +def resolve_hooks(hooks_config): """Resolve a ``hooks:`` config block into spliceable commands + materializations. Parameters @@ -152,9 +153,6 @@ def resolve_hooks(hooks_config, *, source_base): hooks_config : dict or None The top-level ``hooks:`` block (``{'pre_app': [...], 'post_run': [...]}``), or ``None`` when no hooks are configured. - source_base : str - Directory that relative ``script:`` paths resolve against (the - ``babs init`` invocation cwd). Returns ------- @@ -194,7 +192,7 @@ def resolve_hooks(hooks_config, *, source_base): for point in SPLICE_POINTS: for entry in hooks_config.get(point) or []: - mode = _resolve_entry(entry, source_base=source_base) + mode = _resolve_entry(entry) resolved[point].append(_command_for(mode)) if isinstance(mode, (CopyIn, Render)): # Collision is about the materialized file, not the name alone: diff --git a/tests/test_hooks.py b/tests/test_hooks.py index ee9a9b96..8cc2def9 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -11,29 +11,31 @@ def test_none_config_resolves_empty(): - assert resolve_hooks(None, source_base='/base') == ([], [], []) + assert resolve_hooks(None) == ([], [], []) def test_empty_config_resolves_empty(): - assert resolve_hooks({}, source_base='/base') == ([], [], []) + assert resolve_hooks({}) == ([], [], []) def test_missing_or_empty_lists_resolve_empty(): cfg = {'pre_app': None, 'post_run': []} - assert resolve_hooks(cfg, source_base='/base') == ([], [], []) + assert resolve_hooks(cfg) == ([], [], []) def test_raw_snippet_is_verbatim_with_no_materialization(): cfg = {'pre_app': ['echo hello']} - pre_app, post_run, materializations = resolve_hooks(cfg, source_base='/base') + pre_app, post_run, materializations = resolve_hooks(cfg) assert pre_app == ['echo hello'] assert post_run == [] assert materializations == [] -def test_script_relative_path_resolved_against_source_base(): - cfg = {'pre_app': [{'script': 'hooks/validate.sh'}]} - pre_app, _, materializations = resolve_hooks(cfg, source_base='/proj') +def test_script_path_used_verbatim(): + # `script:` is an absolute local path used as-is (like imported_files); + # the destination name is its basename. + cfg = {'pre_app': [{'script': '/proj/hooks/validate.sh'}]} + pre_app, _, materializations = resolve_hooks(cfg) assert pre_app == ['bash ./code/hooks/validate.sh'] assert materializations == [ CopyIn(original_path='/proj/hooks/validate.sh', name='validate') @@ -44,17 +46,10 @@ def test_script_relative_path_resolved_against_source_base(): } -def test_script_absolute_path_passes_through(): - cfg = {'post_run': [{'script': '/abs/path/zip.sh'}]} - _, post_run, materializations = resolve_hooks(cfg, source_base='/proj') - assert post_run == ['bash ./code/hooks/zip.sh'] - assert materializations[0].original_path == '/abs/path/zip.sh' - - def test_default_name_only_strips_trailing_sh(): # a source without a .sh suffix keeps its basename as the name - cfg = {'pre_app': [{'script': 'tools/runme'}]} - pre_app, _, _ = resolve_hooks(cfg, source_base='/proj') + cfg = {'pre_app': [{'script': '/tools/runme'}]} + pre_app, _, _ = resolve_hooks(cfg) assert pre_app == ['bash ./code/hooks/runme.sh'] @@ -63,28 +58,28 @@ def test_order_preserved_within_and_across_splice_points(): 'pre_app': ['echo a', 'echo b'], 'post_run': ['echo c'], } - pre_app, post_run, _ = resolve_hooks(cfg, source_base='/base') + pre_app, post_run, _ = resolve_hooks(cfg) assert pre_app == ['echo a', 'echo b'] assert post_run == ['echo c'] def test_different_sources_same_name_collide(): cfg = { - 'pre_app': [{'script': 'a/validate.sh'}], - 'post_run': [{'script': 'b/validate.sh'}], + 'pre_app': [{'script': '/a/validate.sh'}], + 'post_run': [{'script': '/b/validate.sh'}], } with pytest.raises(ValueError, match='Duplicate hook name'): - resolve_hooks(cfg, source_base='/base') + resolve_hooks(cfg) def test_same_script_at_both_points_materializes_once(): # The identical hook reused at pre_app and post_run (e.g. a validator) is # copied once and referenced from each list -- not a collision. cfg = { - 'pre_app': [{'script': 'hooks/validate.sh'}], - 'post_run': [{'script': 'hooks/validate.sh'}], + 'pre_app': [{'script': '/proj/hooks/validate.sh'}], + 'post_run': [{'script': '/proj/hooks/validate.sh'}], } - pre_app, post_run, materializations = resolve_hooks(cfg, source_base='/proj') + pre_app, post_run, materializations = resolve_hooks(cfg) assert pre_app == ['bash ./code/hooks/validate.sh'] assert post_run == ['bash ./code/hooks/validate.sh'] assert materializations == [ @@ -102,35 +97,35 @@ def test_render_equality_distinguishes_context(): def test_unknown_splice_point_raises(): with pytest.raises(ValueError, match='Unknown hook splice point'): - resolve_hooks({'pre-app': ['echo x']}, source_base='/base') + resolve_hooks({'pre-app': ['echo x']}) def test_non_mapping_config_raises(): with pytest.raises(ValueError, match='must be a mapping'): - resolve_hooks(['echo x'], source_base='/base') + resolve_hooks(['echo x']) @pytest.mark.parametrize('entry', [{'builtin': 'zip'}, {'container': 'nordic'}, 42]) def test_unsupported_entry_forms_raise(entry): with pytest.raises(ValueError, match='Unsupported hook entry'): - resolve_hooks({'pre_app': [entry]}, source_base='/base') + resolve_hooks({'pre_app': [entry]}) @pytest.mark.parametrize('extra_key', ['name', 'singularity_args']) def test_unknown_key_in_script_entry_raises(extra_key): # `name` is rejected too: there is no override in this version. - cfg = {'pre_app': [{'script': 'x.sh', extra_key: 'whatever'}]} + cfg = {'pre_app': [{'script': '/x.sh', extra_key: 'whatever'}]} with pytest.raises(ValueError, match='Unsupported key'): - resolve_hooks(cfg, source_base='/base') + resolve_hooks(cfg) -@pytest.mark.parametrize('bad_source', ['some/dir/', '..', '.']) +@pytest.mark.parametrize('bad_source', ['/some/dir/', '..', '.']) def test_source_with_invalid_derived_name_raises(bad_source): # The destination name is the source basename; a source whose basename is # empty or '.'/'..' has no usable name. cfg = {'pre_app': [{'script': bad_source}]} with pytest.raises(ValueError, match='Invalid hook name'): - resolve_hooks(cfg, source_base='/base') + resolve_hooks(cfg) def test_render_is_defined_but_never_produced(): @@ -139,7 +134,7 @@ def test_render_is_defined_but_never_produced(): assert r.analysis_path == 'code/hooks/zip.sh' # ...but resolve_hooks never returns one in this version (no config form maps to it). _, _, materializations = resolve_hooks( - {'post_run': ['echo verbatim', {'script': 'x.sh'}]}, source_base='/base' + {'post_run': ['echo verbatim', {'script': '/x.sh'}]} ) assert all(isinstance(m, CopyIn) for m in materializations) From bfe27aa564e20d24c60591381c5742311d9dc554 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 12:27:54 -0500 Subject: [PATCH 05/16] hooks: wire resolve_hooks into single-app init (materialize + splice) Connect the resolver to babs for the single-app codepath: - Render: Container.generate_bash_participant_job resolves `self.config.get('hooks')` and passes hook_pre_app/hook_post_run to generate_submit_script (container.py). - Materialize: babs_bootstrap appends the resolved CopyIns' as_import() dicts to the existing shared _init_import_files call, so hook scripts copy into code/hooks/ with no duplicated import logic. - _init_import_files now makedirs the destination's parent (hooks land in code/hooks/, which -- unlike flat code/ -- doesn't pre-exist). The pipeline call site (bootstrap.py:508) is left unwired by design: it defaults hook_pre_app=None so pipeline renders byte-identical, and pipeline mode is slated for deletion once NORDIC is a hook. No hooks configured => no code/hooks/ dir and no splice subshell (both are guarded), so projects without hooks are untouched. Tests: - test_babs_init_raw_bids: splice a form-(b) contract-guard hook at both pre_app and post_run. It runs as a separate process, so it only sees the contract vars (subid/BRANCH/PROJECT_ROOT/JOB_SCRATCH_DIR) because the splice subshell exports them; `${var:?}` fails the job under set -e if any is unset. Same source at both points also exercises copy-once dedup. Plus a post-init assertion that the guard is materialized + spliced -- a dropped hook would otherwise pass silently (no guard left to fail). - test_babs_init_single_app_hooks: init-only wiring check (no job execution) -- asserts a configured hook lands in code/hooks/ and is referenced twice. - test_babs_init_list_sub_file: assert no code/hooks/ dir when no hooks. Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/bootstrap.py | 15 +++++- babs/container.py | 4 ++ tests/test_babs_workflow.py | 91 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/babs/bootstrap.py b/babs/bootstrap.py index 3827bff6..5d448dc9 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -13,6 +13,7 @@ from babs.base import BABS from babs.container import Container +from babs.hooks import resolve_hooks from babs.input_datasets import InputDatasets from babs.status import create_initial_statuses, write_job_status_csv from babs.system import System, validate_queue @@ -270,8 +271,15 @@ def babs_bootstrap( ) container = Container(container_ds, container_name, container_config) - # Copy in any other files needed: - self._init_import_files(container.config.get('imported_files', [])) + # Copy in any other files needed, plus form-(b) hook scripts. Hook + # CopyIns reuse the imported_files path (destination relative to + # self.analysis_path, so it survives a configurable analysis_path). + # Pipeline configs have no `hooks:` block, so this is a no-op there. + _, _, hook_materializations = resolve_hooks(container.config.get('hooks')) + self._init_import_files( + container.config.get('imported_files', []) + + [m.as_import() for m in hook_materializations] + ) # _update_inclusion_dataframe() expects a DataFrame (or None). # If --list_sub_file was provided, use the parsed DataFrame # stored in initial_inclu_df by set_inclusion_dataframe() above. @@ -564,6 +572,9 @@ def _init_import_files(self, file_list): f'Requested imported file {imported_file["original_path"]} does not exist.' ) imported_location = op.join(self.analysis_path, imported_file['analysis_path']) + # Create the destination's parent dir if needed (e.g. hooks land in + # `code/hooks/`, which doesn't pre-exist like flat `code/` does). + os.makedirs(op.dirname(imported_location), exist_ok=True) # Copy the file using pure Python: with ( open(imported_file['original_path'], 'rb') as src, diff --git a/babs/container.py b/babs/container.py index 4424b833..83e5c102 100644 --- a/babs/container.py +++ b/babs/container.py @@ -8,6 +8,7 @@ from babs.generate_bidsapp_runscript import generate_bidsapp_runscript from babs.generate_submit_script import generate_submit_script, generate_test_submit_script +from babs.hooks import resolve_hooks from babs.utils import app_output_settings_from_config @@ -178,6 +179,7 @@ def generate_bash_participant_job( If True, align generated script permissions with shared-group mode. """ + hook_pre_app, hook_post_run, _ = resolve_hooks(self.config.get('hooks')) script_content = generate_submit_script( queue_system=system.type, cluster_resources_config=self.config['cluster_resources'], @@ -188,6 +190,8 @@ def generate_bash_participant_job( container_name=self.container_name, zip_foldernames=self.config['zip_foldernames'], project_root=project_root, + hook_pre_app=hook_pre_app, + hook_post_run=hook_post_run, ) with open(bash_path, 'w') as f: diff --git a/tests/test_babs_workflow.py b/tests/test_babs_workflow.py index 934c57ef..cb611bca 100644 --- a/tests/test_babs_workflow.py +++ b/tests/test_babs_workflow.py @@ -11,6 +11,7 @@ import pandas as pd import pytest +import yaml from conftest import ( ensure_container_image, gather_slurm_job_diagnostics, @@ -68,6 +69,29 @@ def test_babs_init_raw_bids( }, ) + # Splice a contract-guard hook at both splice points. It's a form-(b) + # `script:` (a separate process), so it only sees the contract vars because + # the splice subshell exports them; `${var:?}` fails the job under `set -e` + # if any guaranteed var is unset -- so this e2e goes red if a refactor ever + # breaks the splice contract. Reusing one source at pre_app + post_run also + # exercises copy-once dedup. (sesid is session-only, so it's not guarded + # here; its export is covered by the render-level test.) + contract_guard = project_base / 'contract_guard.sh' + contract_guard.write_text( + ': "${subid:?contract guard: subid not exported}"\n' + ': "${BRANCH:?contract guard: BRANCH not exported}"\n' + ': "${PROJECT_ROOT:?contract guard: PROJECT_ROOT not exported}"\n' + ': "${JOB_SCRATCH_DIR:?contract guard: JOB_SCRATCH_DIR not exported}"\n' + ) + with open(container_config) as f: + cfg = yaml.safe_load(f) + cfg['hooks'] = { + 'pre_app': [{'script': str(contract_guard)}], + 'post_run': [{'script': str(contract_guard)}], + } + with open(container_config, 'w') as f: + yaml.safe_dump(cfg, f) + babs_init_opts = argparse.Namespace( project_root=project_root, list_sub_file=None, @@ -97,6 +121,14 @@ def test_babs_init_raw_bids( with mock.patch.object(argparse.ArgumentParser, 'parse_args', return_value=babs_init_opts): _enter_init() + # The contract guard can only fail the job if it is actually wired in -- a + # dropped hook would leave no assertion to fail. So verify positively that + # the hook is materialized and spliced at both points before relying on it. + analysis_code = project_root / 'analysis' / 'code' + assert (analysis_code / 'hooks' / 'contract_guard.sh').exists() + participant_job = (analysis_code / 'participant_job.sh').read_text() + assert participant_job.count('bash ./code/hooks/contract_guard.sh') == 2 + # babs check-setup: babs_check_setup_opts = argparse.Namespace(project_root=project_root, job_test=True) with mock.patch.object( @@ -217,6 +249,62 @@ def _get_results_branches_use_merge_ds_when_exists(self): raise +@pytest.mark.timeout(300) +def test_babs_init_single_app_hooks( + tmp_path_factory, + bids_data_singlesession, + simbids_container_ds, +): + """`babs init` materializes hook scripts and splices them at both splice points. + + This is the wiring check that backstops the runtime contract-guard hook in + test_babs_init_raw_bids: that guard can only fail the job if it is actually + in place, so a silently dropped hook would pass. Here we assert positively -- + no job execution needed -- that a configured hook is copied into code/hooks/ + and referenced from participant_job.sh at both pre_app and post_run. + """ + project_base = tmp_path_factory.mktemp('hooks_project') + project_root = project_base / 'my_babs_project' + + container_config = update_yaml_for_run( + project_base, + get_config_simbids_path().name, + {'BIDS': bids_data_singlesession}, + ) + hook = project_base / 'echo_hook.sh' + hook.write_text('echo hook-ran\n') + with open(container_config) as f: + cfg = yaml.safe_load(f) + cfg['hooks'] = { + 'pre_app': [{'script': str(hook)}], + 'post_run': [{'script': str(hook)}], + } + with open(container_config, 'w') as f: + yaml.safe_dump(cfg, f) + + babs_init_opts = argparse.Namespace( + project_root=project_root, + list_sub_file=None, + container_ds=simbids_container_ds, + container_name='simbids-0-0-3', + container_config=container_config, + processing_level='subject', + queue='slurm', + keep_if_failed=False, + ) + with mock.patch.object(argparse.ArgumentParser, 'parse_args', return_value=babs_init_opts): + _enter_init() + + analysis_code = project_root / 'analysis' / 'code' + # Materialized once into code/hooks/ (same source at both points -> copy-once): + hook_in_ds = analysis_code / 'hooks' / 'echo_hook.sh' + assert hook_in_ds.exists() + assert hook_in_ds.read_text() == 'echo hook-ran\n' + # Spliced at both pre_app and post_run: + participant_job = (analysis_code / 'participant_job.sh').read_text() + assert participant_job.count('bash ./code/hooks/echo_hook.sh') == 2 + + def test_init_forwards_shared_group(tmp_path): """Test that CLI --shared-group is forwarded to bootstrap.""" options = argparse.Namespace( @@ -312,6 +400,9 @@ def test_babs_init_list_sub_file( _enter_init() assert project_root.exists() + # No hooks configured -> no code/hooks/ dir is created (it's materialized + # lazily, only when a hook actually needs it). + assert not (project_root / 'analysis' / 'code' / 'hooks').exists() inclusion_csv = project_root / 'analysis' / 'code' / 'processing_inclusion.csv' assert inclusion_csv.exists() df = pd.read_csv(inclusion_csv) From 9712f7c29890cc9e85815cbb50a3d971ae7548ad Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 13:52:15 -0500 Subject: [PATCH 06/16] hooks: name the entry forms descriptively, drop (a)/(b)/(c) labels Replace the alphabetic form labels in docstrings/comments with names derived from the config syntax, so a reader doesn't have to decode "(a)/(b)/(c)": (a) raw snippet -> snippet (Verbatim) (b) user script -> script (CopyIn) (c) container -> templated built-in (Render) "templated built-in" names the Render *mode* (babs renders a shipped *.sh.jinja2 into code/hooks/), which serves both a zip hook and a container-running hook (e.g. nordic) alike -- they differ only in what the shipped template does, not in how it's produced. So there is no distinct "container" form; a container-running hook is just a {builtin: } whose template composes a singularity run. Comments/docstrings only -- no behavior change. Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/bootstrap.py | 2 +- babs/hooks.py | 19 ++++++++++--------- tests/test_babs_workflow.py | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/babs/bootstrap.py b/babs/bootstrap.py index 5d448dc9..e4921b34 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -271,7 +271,7 @@ def babs_bootstrap( ) container = Container(container_ds, container_name, container_config) - # Copy in any other files needed, plus form-(b) hook scripts. Hook + # Copy in any other files needed, plus script hooks. Hook # CopyIns reuse the imported_files path (destination relative to # self.analysis_path, so it survives a configurable analysis_path). # Pipeline configs have no `hooks:` block, so this is a no-op there. diff --git a/babs/hooks.py b/babs/hooks.py index fdf6003c..dc125623 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -14,8 +14,8 @@ Entry forms supported in this version: -- **(a) raw snippet** -- a bare string, spliced verbatim (``Verbatim``). -- **(b) user script** -- ``{script: }``; copied into +- **snippet** -- a bare string, spliced verbatim (``Verbatim``). +- **script** -- ``{script: }``; copied into ``code/hooks/.sh`` and invoked as ``bash ./code/hooks/.sh`` (``CopyIn``). ```` is an absolute local path used verbatim -- the same convention as ``imported_files.original_path``, which this reuses. The @@ -24,10 +24,11 @@ it is copied once and referenced from each. Two *different* sources sharing a basename collide (no name override yet -- add one if needed). -`Render` (rendering a shipped ``*.sh.jinja2`` through the shared singularity -partial) is defined here as the forward-compatible seam, but `resolve_hooks` -does not yet produce one -- entries requiring it are rejected. See the -pipeline-of-one design notes for the builtin/container forms it will carry. +`Render` (rendering a shipped ``*.sh.jinja2`` into ``code/hooks/``) is defined +here as the forward-compatible seam, but `resolve_hooks` does not yet produce one +-- entries requiring it are rejected. It is the **templated built-in** mode: the +same mechanism serves a zip hook and a container-running hook (e.g. nordic) alike +-- they differ only in what the shipped template does, not in how it's produced. """ import os.path as op @@ -42,14 +43,14 @@ @dataclass(frozen=True) class Verbatim: - """Form (a): a raw shell snippet spliced inline. No file materialization.""" + """Snippet hook: a raw shell snippet spliced inline. No file materialization.""" command: str @dataclass(frozen=True) class CopyIn: - """Form (b) / static built-in: copy ``original_path`` -> ``code/hooks/.sh``.""" + """Script hook (or static built-in): copy ``original_path`` -> ``code/hooks/.sh``.""" original_path: str name: str @@ -66,7 +67,7 @@ def as_import(self): @dataclass(frozen=True) class Render: - """Form (c) / templated built-in: render ``template_path`` -> ``code/hooks/.sh``. + """Templated built-in: render ``template_path`` -> ``code/hooks/.sh``. Defined as the forward-compatible seam; `resolve_hooks` does not yet produce one (wired in a later step alongside the shared singularity partial). diff --git a/tests/test_babs_workflow.py b/tests/test_babs_workflow.py index cb611bca..4db7d5aa 100644 --- a/tests/test_babs_workflow.py +++ b/tests/test_babs_workflow.py @@ -69,8 +69,8 @@ def test_babs_init_raw_bids( }, ) - # Splice a contract-guard hook at both splice points. It's a form-(b) - # `script:` (a separate process), so it only sees the contract vars because + # Splice a contract-guard hook at both splice points. It's a script hook + # (`script:`, a separate process), so it only sees the contract vars because # the splice subshell exports them; `${var:?}` fails the job under `set -e` # if any guaranteed var is unset -- so this e2e goes red if a refactor ever # breaks the splice contract. Reusing one source at pre_app + post_run also From 79f1c40bbfab4f80c5c35782d2eecc3ffe270503 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 14:32:09 -0500 Subject: [PATCH 07/16] hooks: rename splice point pre_app -> pre_run (symmetry with post_run) The two splice points bracket the same `datalad run`, so name them for that boundary: pre_run / post_run (was pre_app / post_run, which mixed "app" and "run" vocab). Renames the config key, the hook_pre_app -> hook_pre_run template param, and the test markers. Config-key churn is cheapest now, before the keys are public. (Review feedback, PR1-review Q8.) Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/container.py | 4 +-- babs/generate_submit_script.py | 6 ++-- babs/hooks.py | 16 ++++----- babs/templates/participant_job.sh.jinja2 | 6 ++-- tests/test_babs_workflow.py | 10 +++--- tests/test_generate_submit_script.py | 32 +++++++++--------- tests/test_hooks.py | 42 ++++++++++++------------ 7 files changed, 58 insertions(+), 58 deletions(-) diff --git a/babs/container.py b/babs/container.py index 83e5c102..f785cd9f 100644 --- a/babs/container.py +++ b/babs/container.py @@ -179,7 +179,7 @@ def generate_bash_participant_job( If True, align generated script permissions with shared-group mode. """ - hook_pre_app, hook_post_run, _ = resolve_hooks(self.config.get('hooks')) + hook_pre_run, hook_post_run, _ = resolve_hooks(self.config.get('hooks')) script_content = generate_submit_script( queue_system=system.type, cluster_resources_config=self.config['cluster_resources'], @@ -190,7 +190,7 @@ def generate_bash_participant_job( container_name=self.container_name, zip_foldernames=self.config['zip_foldernames'], project_root=project_root, - hook_pre_app=hook_pre_app, + hook_pre_run=hook_pre_run, hook_post_run=hook_post_run, ) diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index 1f4783e9..f6f16517 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -30,7 +30,7 @@ def generate_submit_script( container_images=None, datalad_run_message=None, project_root=None, - hook_pre_app=None, + hook_pre_run=None, hook_post_run=None, ): """ @@ -64,7 +64,7 @@ def generate_submit_script( Absolute path to the BABS project root (parent of `analysis/`). Passed to the template; used in the error message when PROJECT_ROOT is unset. If None, the placeholder ``{project_root}`` is shown. - hook_pre_app : list of str, optional + hook_pre_run : list of str, optional Shell snippets spliced into a subshell just before the ``datalad run`` wrapper. None (or empty) renders nothing. Snippets are emitted in order. hook_post_run : list of str, optional @@ -137,7 +137,7 @@ def generate_submit_script( container_image_paths=container_image_paths, datalad_run_message=datalad_run_message, project_root=project_root, - hook_pre_app=hook_pre_app, + hook_pre_run=hook_pre_run, hook_post_run=hook_post_run, ) diff --git a/babs/hooks.py b/babs/hooks.py index dc125623..be2ea133 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -1,7 +1,7 @@ """Resolve a `hooks:` config block into spliceable commands + materializations. A BABS container config may carry a top-level ``hooks:`` block with -``pre_app:`` / ``post_run:`` lists. Each entry names a snippet of work to run +``pre_run:`` / ``post_run:`` lists. Each entry names a snippet of work to run at the corresponding splice point in ``participant_job.sh`` (added in the splice-points step). This module turns those config entries into: @@ -20,7 +20,7 @@ (``CopyIn``). ```` is an absolute local path used verbatim -- the same convention as ``imported_files.original_path``, which this reuses. The destination name is the source basename. The *same* script may appear at - multiple splice points (e.g. a validator at ``pre_app`` and ``post_run``) -- + multiple splice points (e.g. a validator at ``pre_run`` and ``post_run``) -- it is copied once and referenced from each. Two *different* sources sharing a basename collide (no name override yet -- add one if needed). @@ -38,7 +38,7 @@ HOOKS_SUBDIR = op.join('code', 'hooks') # The splice points a hooks config may target, in run order. -SPLICE_POINTS = ('pre_app', 'post_run') +SPLICE_POINTS = ('pre_run', 'post_run') @dataclass(frozen=True) @@ -104,7 +104,7 @@ def _resolve_entry(entry): Parameters ---------- entry : str or dict - One item from a ``pre_app:`` / ``post_run:`` list. + One item from a ``pre_run:`` / ``post_run:`` list. Returns ------- @@ -152,13 +152,13 @@ def resolve_hooks(hooks_config): Parameters ---------- hooks_config : dict or None - The top-level ``hooks:`` block (``{'pre_app': [...], 'post_run': [...]}``), + The top-level ``hooks:`` block (``{'pre_run': [...], 'post_run': [...]}``), or ``None`` when no hooks are configured. Returns ------- - pre_app : list of str - Command strings to splice at the ``pre_app`` point, in order. + pre_run : list of str + Command strings to splice at the ``pre_run`` point, in order. post_run : list of str Command strings to splice at the ``post_run`` point, in order. materializations : list of (CopyIn or Render) @@ -215,4 +215,4 @@ def resolve_hooks(hooks_config): seen[mode.name] = (mode, point) materializations.append(mode) - return resolved['pre_app'], resolved['post_run'], materializations + return resolved['pre_run'], resolved['post_run'], materializations diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index eac3ee66..7cd68d70 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -130,14 +130,14 @@ for CONTAINER_JOB in "${CONTAINER_IMAGE_PATHS[@]}"; do exit 1 fi done -{% if hook_pre_app %} -# pre_app hooks: spliced before the datalad run wrapper; subshell exports the contract. +{% if hook_pre_run %} +# pre_run hooks: spliced before the datalad run wrapper; subshell exports the contract. ( export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR {% if processing_level == 'session' %} export sesid {% endif %} -{% for snippet in hook_pre_app %} +{% for snippet in hook_pre_run %} {{ snippet }} {% endfor %} ) diff --git a/tests/test_babs_workflow.py b/tests/test_babs_workflow.py index 4db7d5aa..3479b03a 100644 --- a/tests/test_babs_workflow.py +++ b/tests/test_babs_workflow.py @@ -73,7 +73,7 @@ def test_babs_init_raw_bids( # (`script:`, a separate process), so it only sees the contract vars because # the splice subshell exports them; `${var:?}` fails the job under `set -e` # if any guaranteed var is unset -- so this e2e goes red if a refactor ever - # breaks the splice contract. Reusing one source at pre_app + post_run also + # breaks the splice contract. Reusing one source at pre_run + post_run also # exercises copy-once dedup. (sesid is session-only, so it's not guarded # here; its export is covered by the render-level test.) contract_guard = project_base / 'contract_guard.sh' @@ -86,7 +86,7 @@ def test_babs_init_raw_bids( with open(container_config) as f: cfg = yaml.safe_load(f) cfg['hooks'] = { - 'pre_app': [{'script': str(contract_guard)}], + 'pre_run': [{'script': str(contract_guard)}], 'post_run': [{'script': str(contract_guard)}], } with open(container_config, 'w') as f: @@ -261,7 +261,7 @@ def test_babs_init_single_app_hooks( test_babs_init_raw_bids: that guard can only fail the job if it is actually in place, so a silently dropped hook would pass. Here we assert positively -- no job execution needed -- that a configured hook is copied into code/hooks/ - and referenced from participant_job.sh at both pre_app and post_run. + and referenced from participant_job.sh at both pre_run and post_run. """ project_base = tmp_path_factory.mktemp('hooks_project') project_root = project_base / 'my_babs_project' @@ -276,7 +276,7 @@ def test_babs_init_single_app_hooks( with open(container_config) as f: cfg = yaml.safe_load(f) cfg['hooks'] = { - 'pre_app': [{'script': str(hook)}], + 'pre_run': [{'script': str(hook)}], 'post_run': [{'script': str(hook)}], } with open(container_config, 'w') as f: @@ -300,7 +300,7 @@ def test_babs_init_single_app_hooks( hook_in_ds = analysis_code / 'hooks' / 'echo_hook.sh' assert hook_in_ds.exists() assert hook_in_ds.read_text() == 'echo hook-ran\n' - # Spliced at both pre_app and post_run: + # Spliced at both pre_run and post_run: participant_job = (analysis_code / 'participant_job.sh').read_text() assert participant_job.count('bash ./code/hooks/echo_hook.sh') == 2 diff --git a/tests/test_generate_submit_script.py b/tests/test_generate_submit_script.py index ebf07087..e271bc2b 100644 --- a/tests/test_generate_submit_script.py +++ b/tests/test_generate_submit_script.py @@ -117,10 +117,10 @@ def test_generate_submit_script(input_datasets, config_file, processing_level, t # --------------------------------------------------------------------------- -# pre_app / post_run splice-point hooks +# pre_run / post_run splice-point hooks # --------------------------------------------------------------------------- -PRE_APP_MARKER = '# pre_app hooks:' +PRE_RUN_MARKER = '# pre_run hooks:' POST_RUN_MARKER = '# post_run hooks:' CONTRACT_EXPORT = 'export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR' @@ -141,19 +141,19 @@ def _render_with_hooks(processing_level='subject', **hook_kwargs): ) -def _pre_app_block(text): - """Slice out the rendered pre_app subshell (marker through its closing paren).""" - start = text.index(PRE_APP_MARKER) +def _pre_run_block(text): + """Slice out the rendered pre_run subshell (marker through its closing paren).""" + start = text.index(PRE_RUN_MARKER) return text[start : text.index('\n)\n', start) + 2] def test_hooks_absent_when_not_configured(tmp_path): """No hooks configured: no splice blocks rendered, and None behaves like [].""" default_render = _render_with_hooks() - empty_render = _render_with_hooks(hook_pre_app=[], hook_post_run=[]) + empty_render = _render_with_hooks(hook_pre_run=[], hook_post_run=[]) assert default_render == empty_render # None and [] are both no-ops - assert PRE_APP_MARKER not in default_render + assert PRE_RUN_MARKER not in default_render assert POST_RUN_MARKER not in default_render out_fn = tmp_path / 'participant_job_nohooks.sh' @@ -162,15 +162,15 @@ def test_hooks_absent_when_not_configured(tmp_path): assert passed, status -def test_pre_app_and_post_run_hooks_spliced(tmp_path): +def test_pre_run_and_post_run_hooks_spliced(tmp_path): """Configured hooks render as subshells exporting the contract, in order, positioned around the datalad run wrapper.""" text = _render_with_hooks( - hook_pre_app=['echo PRE_A', 'echo PRE_B'], + hook_pre_run=['echo PRE_A', 'echo PRE_B'], hook_post_run=['echo POST_ONE'], ) - assert PRE_APP_MARKER in text + assert PRE_RUN_MARKER in text assert POST_RUN_MARKER in text # the contract is exported once per splice subshell assert text.count(CONTRACT_EXPORT) == 2 @@ -178,8 +178,8 @@ def test_pre_app_and_post_run_hooks_spliced(tmp_path): assert text.index('echo PRE_A') < text.index('echo PRE_B') assert 'echo POST_ONE' in text - # pre_app sits before the run; post_run after it, before the push - i_pre = text.index(PRE_APP_MARKER) + # pre_run sits before the run; post_run after it, before the push + i_pre = text.index(PRE_RUN_MARKER) i_run = text.index('\ndatalad run ') i_post = text.index(POST_RUN_MARKER) i_push = text.index('datalad push --to output-storage') @@ -195,11 +195,11 @@ def test_pre_app_and_post_run_hooks_spliced(tmp_path): def test_hooks_export_sesid_only_at_session_level(): """sesid joins the exported contract only for session-level processing.""" - session = _render_with_hooks(processing_level='session', hook_pre_app=['echo HI']) - subject = _render_with_hooks(processing_level='subject', hook_pre_app=['echo HI']) + session = _render_with_hooks(processing_level='session', hook_pre_run=['echo HI']) + subject = _render_with_hooks(processing_level='subject', hook_pre_run=['echo HI']) - assert 'export sesid' in _pre_app_block(session) - assert 'export sesid' not in _pre_app_block(subject) + assert 'export sesid' in _pre_run_block(session) + assert 'export sesid' not in _pre_run_block(subject) def run_shellcheck(script_path): diff --git a/tests/test_hooks.py b/tests/test_hooks.py index 8cc2def9..e24c2759 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -19,14 +19,14 @@ def test_empty_config_resolves_empty(): def test_missing_or_empty_lists_resolve_empty(): - cfg = {'pre_app': None, 'post_run': []} + cfg = {'pre_run': None, 'post_run': []} assert resolve_hooks(cfg) == ([], [], []) def test_raw_snippet_is_verbatim_with_no_materialization(): - cfg = {'pre_app': ['echo hello']} - pre_app, post_run, materializations = resolve_hooks(cfg) - assert pre_app == ['echo hello'] + cfg = {'pre_run': ['echo hello']} + pre_run, post_run, materializations = resolve_hooks(cfg) + assert pre_run == ['echo hello'] assert post_run == [] assert materializations == [] @@ -34,9 +34,9 @@ def test_raw_snippet_is_verbatim_with_no_materialization(): def test_script_path_used_verbatim(): # `script:` is an absolute local path used as-is (like imported_files); # the destination name is its basename. - cfg = {'pre_app': [{'script': '/proj/hooks/validate.sh'}]} - pre_app, _, materializations = resolve_hooks(cfg) - assert pre_app == ['bash ./code/hooks/validate.sh'] + cfg = {'pre_run': [{'script': '/proj/hooks/validate.sh'}]} + pre_run, _, materializations = resolve_hooks(cfg) + assert pre_run == ['bash ./code/hooks/validate.sh'] assert materializations == [ CopyIn(original_path='/proj/hooks/validate.sh', name='validate') ] @@ -48,24 +48,24 @@ def test_script_path_used_verbatim(): def test_default_name_only_strips_trailing_sh(): # a source without a .sh suffix keeps its basename as the name - cfg = {'pre_app': [{'script': '/tools/runme'}]} - pre_app, _, _ = resolve_hooks(cfg) - assert pre_app == ['bash ./code/hooks/runme.sh'] + cfg = {'pre_run': [{'script': '/tools/runme'}]} + pre_run, _, _ = resolve_hooks(cfg) + assert pre_run == ['bash ./code/hooks/runme.sh'] def test_order_preserved_within_and_across_splice_points(): cfg = { - 'pre_app': ['echo a', 'echo b'], + 'pre_run': ['echo a', 'echo b'], 'post_run': ['echo c'], } - pre_app, post_run, _ = resolve_hooks(cfg) - assert pre_app == ['echo a', 'echo b'] + pre_run, post_run, _ = resolve_hooks(cfg) + assert pre_run == ['echo a', 'echo b'] assert post_run == ['echo c'] def test_different_sources_same_name_collide(): cfg = { - 'pre_app': [{'script': '/a/validate.sh'}], + 'pre_run': [{'script': '/a/validate.sh'}], 'post_run': [{'script': '/b/validate.sh'}], } with pytest.raises(ValueError, match='Duplicate hook name'): @@ -73,14 +73,14 @@ def test_different_sources_same_name_collide(): def test_same_script_at_both_points_materializes_once(): - # The identical hook reused at pre_app and post_run (e.g. a validator) is + # The identical hook reused at pre_run and post_run (e.g. a validator) is # copied once and referenced from each list -- not a collision. cfg = { - 'pre_app': [{'script': '/proj/hooks/validate.sh'}], + 'pre_run': [{'script': '/proj/hooks/validate.sh'}], 'post_run': [{'script': '/proj/hooks/validate.sh'}], } - pre_app, post_run, materializations = resolve_hooks(cfg) - assert pre_app == ['bash ./code/hooks/validate.sh'] + pre_run, post_run, materializations = resolve_hooks(cfg) + assert pre_run == ['bash ./code/hooks/validate.sh'] assert post_run == ['bash ./code/hooks/validate.sh'] assert materializations == [ CopyIn(original_path='/proj/hooks/validate.sh', name='validate') @@ -108,13 +108,13 @@ def test_non_mapping_config_raises(): @pytest.mark.parametrize('entry', [{'builtin': 'zip'}, {'container': 'nordic'}, 42]) def test_unsupported_entry_forms_raise(entry): with pytest.raises(ValueError, match='Unsupported hook entry'): - resolve_hooks({'pre_app': [entry]}) + resolve_hooks({'pre_run': [entry]}) @pytest.mark.parametrize('extra_key', ['name', 'singularity_args']) def test_unknown_key_in_script_entry_raises(extra_key): # `name` is rejected too: there is no override in this version. - cfg = {'pre_app': [{'script': '/x.sh', extra_key: 'whatever'}]} + cfg = {'pre_run': [{'script': '/x.sh', extra_key: 'whatever'}]} with pytest.raises(ValueError, match='Unsupported key'): resolve_hooks(cfg) @@ -123,7 +123,7 @@ def test_unknown_key_in_script_entry_raises(extra_key): def test_source_with_invalid_derived_name_raises(bad_source): # The destination name is the source basename; a source whose basename is # empty or '.'/'..' has no usable name. - cfg = {'pre_app': [{'script': bad_source}]} + cfg = {'pre_run': [{'script': bad_source}]} with pytest.raises(ValueError, match='Invalid hook name'): resolve_hooks(cfg) From fc09636f3684c38d6e1c9ece21feea7e0daf683d Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 14:34:55 -0500 Subject: [PATCH 08/16] docs: document the hooks config section Add a `hooks` section to the config-yaml docs: the pre_run/post_run splice points, the snippet and script entry forms, the exported runtime contract (subid/sesid/BRANCH/PROJECT_ROOT/JOB_SCRATCH_DIR) + subshell/set -e semantics, and -- most importantly -- that hooks splice OUTSIDE the datalad run wrapper, so a hook that writes files leaves uncommitted changes that aren't pushed; the safe default is validation that fails the job, and persisting output requires the hook to own its own datalad run/save. (Review feedback, PR1-review Q1/Q2/Q10/Q11.) Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/preparation_config_yaml_file.rst | 56 +++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/docs/preparation_config_yaml_file.rst b/docs/preparation_config_yaml_file.rst index 767a95ac..a0e102b7 100644 --- a/docs/preparation_config_yaml_file.rst +++ b/docs/preparation_config_yaml_file.rst @@ -162,6 +162,62 @@ It also means that the ``recon_spec.yaml`` file will be tracked by datalad. **Important**: If you are importing a large file this mechanism will not work. +Section ``hooks`` +================= + +This section is optional. It lets you splice your own shell commands into each +participant job at two **splice points** that bracket the BIDS App run: + +- ``pre_run`` — runs just **before** the ``datalad run`` that executes the BIDS App. +- ``post_run`` — runs just **after** that ``datalad run`` completes (its outputs + already committed), but **before** the results are pushed to the output RIA. + +Each splice point is a list; entries run in the order you list them. + +Example section **hooks** +------------------------- + +.. code-block:: yaml + + hooks: + pre_run: + - echo "starting ${subid}" # a raw shell snippet + - script: "/path/to/validate-inputs.sh" # a script copied into the project + post_run: + - script: "/path/to/validate-outputs.sh" + +Two entry forms are supported: + +- **snippet** — a bare string. It is spliced **verbatim** into the job script + and runs *inline*. You own its quoting and safety (this is shell injection by + the config author, by design). +- **script** — ``{script: }``. ```` is an **absolute** local path + (copied into the project the same way as ``imported_files``). BABS copies it to + ``code/hooks/.sh`` at ``babs init`` and the splice runs + ``bash ./code/hooks/.sh`` — a **separate process**. + +The runtime contract +-------------------- + +Both splice points run with the working directory set to the cloned dataset and +with these variables **exported** (so a separate-process ``script`` hook can read +them): ``subid``; ``sesid`` (session-level processing only); ``BRANCH``; +``PROJECT_ROOT``; ``JOB_SCRATCH_DIR``. Each splice runs in a **subshell** under +``set -e``: a hook's ``cd`` or variable changes do **not** leak into the rest of +the job, and a non-zero exit **aborts the job**. + +Persisting hook output +---------------------- + +Hooks splice **outside** the ``datalad run`` wrapper, so their filesystem effects +are **not** captured by it. A ``post_run`` hook that merely writes files into the +dataset leaves **uncommitted** changes — ``datalad push`` ships committed content, +so those files are **not pushed**. The common, safe use is **validation that fails +the job**: a ``post_run`` validator that exits non-zero aborts the job *before* +the push, so bad results never leave the node. If a hook needs to **persist** +output, it must run its **own** ``datalad run``/``datalad save`` to commit it. + + Section ``bids_app_args`` ========================= Currently, BABS does not support using configurations of running a BIDS App From a3fb7cf65a0ea8cd02432c03397c2a950e8c4066 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 14:47:42 -0500 Subject: [PATCH 09/16] hooks: fix collision message for a same-splice-point clash Two different scripts sharing a basename within one splice point produced "(in 'pre_run' and 'pre_run')". Name the single point when prior_point == point, the pair otherwise. Adds test_same_point_same_name_collide and tightens the cross-point test to assert the full location. (Review feedback, PR1-review Q12.) Co-Authored-By: Claude Opus 4.8 (1M context) --- babs/hooks.py | 6 +++--- tests/test_hooks.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/babs/hooks.py b/babs/hooks.py index be2ea133..ed7267ea 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -207,10 +207,10 @@ def resolve_hooks(hooks_config): prior, prior_point = seen[mode.name] if mode == prior: continue # same file; already materialized + where = repr(point) if prior_point == point else f'{prior_point!r} and {point!r}' raise ValueError( - f'Duplicate hook name {mode.name!r} (in {prior_point!r} and ' - f'{point!r}) resolves to different content; both would write ' - f'{op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' + f'Duplicate hook name {mode.name!r} ({where}): two different ' + f'hooks both resolve to {op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' ) seen[mode.name] = (mode, point) materializations.append(mode) diff --git a/tests/test_hooks.py b/tests/test_hooks.py index e24c2759..f9d1d480 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -68,7 +68,15 @@ def test_different_sources_same_name_collide(): 'pre_run': [{'script': '/a/validate.sh'}], 'post_run': [{'script': '/b/validate.sh'}], } - with pytest.raises(ValueError, match='Duplicate hook name'): + with pytest.raises(ValueError, match=r"Duplicate hook name 'validate' \('pre_run' and 'post_run'\)"): + resolve_hooks(cfg) + + +def test_same_point_same_name_collide(): + # Two different scripts with the same basename in one splice point: the + # message names the single point, not "'pre_run' and 'pre_run'". + cfg = {'pre_run': [{'script': '/a/validate.sh'}, {'script': '/b/validate.sh'}]} + with pytest.raises(ValueError, match=r"Duplicate hook name 'validate' \('pre_run'\)"): resolve_hooks(cfg) From 694d7a6ff9353b77d6ef942b95fe868fe2795bbd Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 20:15:34 -0500 Subject: [PATCH 10/16] hooks: resolve {builtin: } entries to Render (zip's config form) A {builtin: } entry now resolves to a Render of the shipped templates/hooks/.sh.jinja2; keys beyond `builtin` become the per-hook part of the render context (e.g. zip's optional `path`). Bootstrap still gates with NotImplementedError until the next step wires the actual rendering; CopyIns keep flowing to _init_import_files. Co-Authored-By: Claude Fable 5 --- babs/bootstrap.py | 13 ++++++++++-- babs/hooks.py | 41 ++++++++++++++++++++++++++---------- tests/test_hooks.py | 51 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 81 insertions(+), 24 deletions(-) diff --git a/babs/bootstrap.py b/babs/bootstrap.py index e4921b34..de5dcf94 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -13,7 +13,7 @@ from babs.base import BABS from babs.container import Container -from babs.hooks import resolve_hooks +from babs.hooks import CopyIn, Render, resolve_hooks from babs.input_datasets import InputDatasets from babs.status import create_initial_statuses, write_job_status_csv from babs.system import System, validate_queue @@ -276,9 +276,18 @@ def babs_bootstrap( # self.analysis_path, so it survives a configurable analysis_path). # Pipeline configs have no `hooks:` block, so this is a no-op there. _, _, hook_materializations = resolve_hooks(container.config.get('hooks')) + renders = [m for m in hook_materializations if isinstance(m, Render)] + if renders: + # Built-in (Render) materialization -- rendering the shipped + # `templates/hooks/.sh.jinja2` into code/hooks/ -- lands in the + # next slice (zip-as-hook). Reject cleanly until then. + raise NotImplementedError( + f'Built-in hooks not yet materialized: {[r.name for r in renders]}' + ) + copy_ins = [m for m in hook_materializations if isinstance(m, CopyIn)] self._init_import_files( container.config.get('imported_files', []) - + [m.as_import() for m in hook_materializations] + + [m.as_import() for m in copy_ins] ) # _update_inclusion_dataframe() expects a DataFrame (or None). # If --list_sub_file was provided, use the parsed DataFrame diff --git a/babs/hooks.py b/babs/hooks.py index ed7267ea..e19da93d 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -23,12 +23,17 @@ multiple splice points (e.g. a validator at ``pre_run`` and ``post_run``) -- it is copied once and referenced from each. Two *different* sources sharing a basename collide (no name override yet -- add one if needed). - -`Render` (rendering a shipped ``*.sh.jinja2`` into ``code/hooks/``) is defined -here as the forward-compatible seam, but `resolve_hooks` does not yet produce one --- entries requiring it are rejected. It is the **templated built-in** mode: the -same mechanism serves a zip hook and a container-running hook (e.g. nordic) alike --- they differ only in what the shipped template does, not in how it's produced. +- **built-in** -- ``{builtin: }``; babs renders a shipped + ``templates/hooks/.sh.jinja2`` into ``code/hooks/.sh`` at init + (``Render``). Keys beyond ``builtin`` are per-hook params for the template; + bootstrap merges in the top-level/derived render context (e.g. ``output_dir``). + ``zip`` is the first built-in. + +The **container-running templated built-in** (a babs-shipped ``singularity run`` +template parameterised by per-hook ``singularity_args``/``bids_app_args``) also +lands as a ``Render``, but is still deferred -- ``{container: ...}`` entries are +rejected for now. User-provided jinja templates are intentionally *not* +supported: a template is inert without babs code to populate its context. """ import os.path as op @@ -69,8 +74,10 @@ def as_import(self): class Render: """Templated built-in: render ``template_path`` -> ``code/hooks/.sh``. - Defined as the forward-compatible seam; `resolve_hooks` does not yet produce - one (wired in a later step alongside the shared singularity partial). + ``template_path`` is loader-relative (``PackageLoader('babs', 'templates')``). + ``context`` is the config-derived part (per-hook params); bootstrap merges in + the top-level/derived part (e.g. ``output_dir``) before rendering. Produced for + ``{builtin: }`` entries; the ``{container: ...}`` scaffold is still deferred. """ template_path: str @@ -108,8 +115,8 @@ def _resolve_entry(entry): Returns ------- - Verbatim or CopyIn - The classified mode. (`Render` is not produced in this version.) + Verbatim, CopyIn, or Render + The classified mode. """ if isinstance(entry, str): return Verbatim(command=entry) @@ -131,9 +138,21 @@ def _resolve_entry(entry): # `_init_import_files`, which raises FileNotFoundError on a bad path). return CopyIn(original_path=source, name=name) + if isinstance(entry, dict) and 'builtin' in entry: + name = entry['builtin'] + _validate_name(name, entry) + # Keys beyond `builtin` are per-hook parameters for the built-in's + # template (e.g. zip's optional `path`). They are the *config-derived* + # part of the render context; bootstrap merges in the top-level/derived + # part (e.g. `output_dir`, `processing_level`) at render time. The + # template path is loader-relative (PackageLoader('babs', 'templates')); + # an unknown built-in fails as TemplateNotFound at init. + context = {k: v for k, v in entry.items() if k != 'builtin'} + return Render(template_path=f'hooks/{name}.sh.jinja2', name=name, context=context) + raise ValueError( f'Unsupported hook entry: {entry!r}. This version supports a raw shell ' - 'string or a {script: } mapping.' + 'string, a {script: } mapping, or a {builtin: } mapping.' ) diff --git a/tests/test_hooks.py b/tests/test_hooks.py index f9d1d480..4c7c7566 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -98,7 +98,6 @@ def test_same_script_at_both_points_materializes_once(): def test_render_equality_distinguishes_context(): # The collision rule keys on descriptor equality; Render carries `context`, # so the same template rendered two ways into one name is a real conflict. - # (Render isn't produced by resolve_hooks yet -- this pins the invariant.) assert Render('zip.sh.jinja2', 'zip', {'a': 1}) == Render('zip.sh.jinja2', 'zip', {'a': 1}) assert Render('zip.sh.jinja2', 'zip', {'a': 1}) != Render('zip.sh.jinja2', 'zip', {'a': 2}) @@ -113,8 +112,9 @@ def test_non_mapping_config_raises(): resolve_hooks(['echo x']) -@pytest.mark.parametrize('entry', [{'builtin': 'zip'}, {'container': 'nordic'}, 42]) +@pytest.mark.parametrize('entry', [{'container': 'nordic'}, {'unknown': 'x'}, 42]) def test_unsupported_entry_forms_raise(entry): + # The container-running form is still deferred; arbitrary dicts / scalars are invalid. with pytest.raises(ValueError, match='Unsupported hook entry'): resolve_hooks({'pre_run': [entry]}) @@ -136,15 +136,44 @@ def test_source_with_invalid_derived_name_raises(bad_source): resolve_hooks(cfg) -def test_render_is_defined_but_never_produced(): - # The forward-compat seam exists as a type... - r = Render(template_path='t.sh.jinja2', name='zip', context={'k': 'v'}) - assert r.analysis_path == 'code/hooks/zip.sh' - # ...but resolve_hooks never returns one in this version (no config form maps to it). - _, _, materializations = resolve_hooks( - {'post_run': ['echo verbatim', {'script': '/x.sh'}]} - ) - assert all(isinstance(m, CopyIn) for m in materializations) +def test_builtin_resolves_to_render(): + cfg = {'post_run': [{'builtin': 'zip'}]} + pre_run, post_run, materializations = resolve_hooks(cfg) + assert pre_run == [] + assert post_run == ['bash ./code/hooks/zip.sh'] + assert materializations == [ + Render(template_path='hooks/zip.sh.jinja2', name='zip', context={}) + ] + assert materializations[0].analysis_path == 'code/hooks/zip.sh' + + +def test_builtin_extra_keys_become_context(): + # per-hook params (e.g. zip's optional `path`) flow into the render context + cfg = {'post_run': [{'builtin': 'zip', 'path': 'outputs/freesurfer-7-3-2'}]} + _, _, materializations = resolve_hooks(cfg) + assert materializations == [ + Render( + template_path='hooks/zip.sh.jinja2', + name='zip', + context={'path': 'outputs/freesurfer-7-3-2'}, + ) + ] + + +def test_builtin_same_name_different_context_collides(): + # descriptor equality includes context -> same name rendered two ways collides + cfg = { + 'pre_run': [{'builtin': 'zip'}], + 'post_run': [{'builtin': 'zip', 'path': 'outputs/x'}], + } + with pytest.raises(ValueError, match='Duplicate hook name'): + resolve_hooks(cfg) + + +def test_builtin_invalid_name_raises(): + cfg = {'post_run': [{'builtin': '../escape'}]} + with pytest.raises(ValueError, match='Invalid hook name'): + resolve_hooks(cfg) def test_verbatim_dataclass_shape(): From 11ed923b975f752613d8ca5f9446abdbef14900f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 21:11:00 -0500 Subject: [PATCH 11/16] hooks: add the zip built-in template + bootstrap Render materialization `_init_render_hooks` renders each Render's shipped template (babs/templates/hooks/) into code/hooks/.sh at init and datalad saves, replacing the NotImplementedError gate. Render context = the per-hook config params plus the derived part (processing_level), with derived values taking precedence. zip.sh.jinja2 is the first built-in: unlock -> 7z inside its own `datalad run --explicit` -> remove the granular outputs in a separate commit (the datalad/datalad#7822 workaround, TODO'd for removal once babs's minimum datalad carries the fix). One hook = one zip; the archive is named ${subid}[_${sesid}]_.zip and contains basename(path) at its top level. Also wraps a pre-existing >99-char line in hooks.py that failed ruff. Co-Authored-By: Claude Fable 5 --- babs/bootstrap.py | 52 +++++++++++++++++++++++------- babs/hooks.py | 4 ++- babs/templates/hooks/zip.sh.jinja2 | 43 ++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 12 deletions(-) create mode 100644 babs/templates/hooks/zip.sh.jinja2 diff --git a/babs/bootstrap.py b/babs/bootstrap.py index de5dcf94..f9186d81 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -271,19 +271,13 @@ def babs_bootstrap( ) container = Container(container_ds, container_name, container_config) - # Copy in any other files needed, plus script hooks. Hook - # CopyIns reuse the imported_files path (destination relative to - # self.analysis_path, so it survives a configurable analysis_path). + # Materialize hooks (and copy in any other files needed). Built-in + # (Render) hooks are rendered from shipped templates; script (CopyIn) + # hooks reuse the imported_files path. Both destinations are relative + # to self.analysis_path, so they survive a configurable analysis_path. # Pipeline configs have no `hooks:` block, so this is a no-op there. _, _, hook_materializations = resolve_hooks(container.config.get('hooks')) - renders = [m for m in hook_materializations if isinstance(m, Render)] - if renders: - # Built-in (Render) materialization -- rendering the shipped - # `templates/hooks/.sh.jinja2` into code/hooks/ -- lands in the - # next slice (zip-as-hook). Reject cleanly until then. - raise NotImplementedError( - f'Built-in hooks not yet materialized: {[r.name for r in renders]}' - ) + self._init_render_hooks([m for m in hook_materializations if isinstance(m, Render)]) copy_ins = [m for m in hook_materializations if isinstance(m, CopyIn)] self._init_import_files( container.config.get('imported_files', []) @@ -602,6 +596,42 @@ def _init_import_files(self, file_list): message='Import files', ) + def _init_render_hooks(self, renders): + """ + Render built-in hook templates into ``code/hooks/`` and datalad save. + + Parameters + ---------- + renders: list of babs.hooks.Render + Built-in hooks to materialize. Each is rendered from its shipped + template (``babs/templates/hooks/``) with a context of the per-hook + config params plus the derived part supplied here + (``processing_level``). + """ + if not renders: + return + env = Environment( + loader=PackageLoader('babs', 'templates'), + trim_blocks=True, + lstrip_blocks=True, + autoescape=False, + undefined=StrictUndefined, + ) + rendered_files = [] + for render in renders: + # Derived values win; a per-hook config param can't override them. + context = {**render.context, 'processing_level': self.processing_level} + content = env.get_template(render.template_path).render(**context) + destination = op.join(self.analysis_path, render.analysis_path) + os.makedirs(op.dirname(destination), exist_ok=True) + with open(destination, 'w') as f: + f.write(content) + rendered_files.append(render.analysis_path) + self.datalad_save( + path=rendered_files, + message='Materialize built-in hook scripts', + ) + def clean_up(self): """ If `babs init` failed, this function cleans up the BABS project `babs init` creates. diff --git a/babs/hooks.py b/babs/hooks.py index e19da93d..9e789631 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -226,7 +226,9 @@ def resolve_hooks(hooks_config): prior, prior_point = seen[mode.name] if mode == prior: continue # same file; already materialized - where = repr(point) if prior_point == point else f'{prior_point!r} and {point!r}' + where = ( + repr(point) if prior_point == point else f'{prior_point!r} and {point!r}' + ) raise ValueError( f'Duplicate hook name {mode.name!r} ({where}): two different ' f'hooks both resolve to {op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' diff --git a/babs/templates/hooks/zip.sh.jinja2 b/babs/templates/hooks/zip.sh.jinja2 new file mode 100644 index 00000000..99685386 --- /dev/null +++ b/babs/templates/hooks/zip.sh.jinja2 @@ -0,0 +1,43 @@ +#!/bin/bash +# Built-in babs `zip` hook: archive a BIDS App output folder, commit the +# archive, and remove the granular outputs it replaces. +# +# Rendered once at `babs init` into `code/hooks/zip.sh`. Runs at the +# `post_run` splice point (cwd = the job's dataset clone) as a separate +# process: `subid` (and `sesid` at session level) arrive via the exported +# splice-point contract. +set -e -u -x + +{# `path` is the folder to zip, e.g. `outputs/fmriprep_minimal-25-2-5`. #} +{# The archive is written to the dataset root and contains the folder #} +{# itself (not its parents), matching the layout of babs zips to date. #} +{% set zip_folder = path.rsplit('/', 1)[-1] %} +{% set zip_dir = path.rsplit('/', 1)[0] if '/' in path else '.' %} +{% set updirs = '../' * (zip_dir.count('/') + 1) if zip_dir != '.' else '' %} +# subid/sesid are exported by the splice-point subshell in participant_job.sh +# shellcheck disable=SC2154 +{% if processing_level == 'session' %} +ZIP_ID="${subid}_${sesid}" +{% else %} +ZIP_ID="${subid}" +{% endif %} +ZIP_NAME="${ZIP_ID}_{{ zip_folder }}.zip" + +# Zip real file content, not annex symlinks: +datalad unlock "{{ path }}" + +datalad run \ + --explicit \ + --output "${ZIP_NAME}" \ + -m "Zip {{ path }} for ${ZIP_ID}" \ + -- \ + "cd {{ zip_dir }} && 7z a {{ updirs }}${ZIP_NAME} {{ zip_folder }}" + +# `datalad run --explicit` does not track deletions, so the granular outputs +# are removed in a separate commit (workaround for datalad/datalad#7822, +# since fixed upstream). +# TODO research which datalad version shipped the datalad/datalad#7822 fix; +# once babs's minimum supported datalad is at or above it, fold this removal +# into the datalad run above and drop this step. +git rm -rf -q --sparse "{{ path }}" +git commit -m "Remove {{ path }} for ${ZIP_ID} (zipped)" From df2c8ec923f0c0e9e754642ed722b30b1dcd0176 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 21:11:55 -0500 Subject: [PATCH 12/16] docs: document the built-in hook form + the zip built-in Co-Authored-By: Claude Fable 5 --- docs/preparation_config_yaml_file.rst | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/preparation_config_yaml_file.rst b/docs/preparation_config_yaml_file.rst index a0e102b7..eea66cfc 100644 --- a/docs/preparation_config_yaml_file.rst +++ b/docs/preparation_config_yaml_file.rst @@ -185,8 +185,10 @@ Example section **hooks** - script: "/path/to/validate-inputs.sh" # a script copied into the project post_run: - script: "/path/to/validate-outputs.sh" + - builtin: zip # a built-in shipped with BABS + path: outputs/fmriprep_minimal-25-2-5 -Two entry forms are supported: +Three entry forms are supported: - **snippet** — a bare string. It is spliced **verbatim** into the job script and runs *inline*. You own its quoting and safety (this is shell injection by @@ -195,6 +197,22 @@ Two entry forms are supported: (copied into the project the same way as ``imported_files``). BABS copies it to ``code/hooks/.sh`` at ``babs init`` and the splice runs ``bash ./code/hooks/.sh`` — a **separate process**. +- **built-in** — ``{builtin: }``. The hook ships with BABS as a template; + ``babs init`` renders it into ``code/hooks/.sh`` (git-tracked, so you can + read exactly what will run) and the splice runs it like a script hook. Keys + beyond ``builtin`` are parameters for that built-in. + +Built-in: ``zip`` +----------------- + +``{builtin: zip, path: }`` archives one output folder as a ``post_run`` +hook. ``path`` is the folder to zip, relative to the dataset root (e.g. +``outputs/fmriprep_minimal-25-2-5``). The hook zips it into +``${subid}[_${sesid}]_.zip`` at the dataset root inside its **own** +``datalad run`` (so the archive is committed with provenance), then removes the +granular folder in a follow-up commit. The archive contains the folder itself +(e.g. ``fmriprep_minimal-25-2-5/``) at its top level. To produce several +separate archives, list several zip hooks, each with its own ``path``. The runtime contract -------------------- From 5ddf968d10732d317e9581b02a4378aaaf7ffaf9 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 22:07:50 -0500 Subject: [PATCH 13/16] tests: init-level coverage for the zip built-in (+ demo block) Asserts {builtin: zip, path: ...} renders code/hooks/zip.sh (id, archive name, 7z line, granular removal), splices once at post_run, and lands the "Materialize built-in hook scripts" init commit. The TODO-fenced demo block copies the rendered artifacts into demo-output/ for inspection after a docker run; remove before PR assembly. Co-Authored-By: Claude Fable 5 --- tests/test_babs_workflow.py | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/test_babs_workflow.py b/tests/test_babs_workflow.py index 3479b03a..f03d72ec 100644 --- a/tests/test_babs_workflow.py +++ b/tests/test_babs_workflow.py @@ -305,6 +305,80 @@ def test_babs_init_single_app_hooks( assert participant_job.count('bash ./code/hooks/echo_hook.sh') == 2 +def test_babs_init_builtin_zip_hook( + tmp_path_factory, + bids_data_singlesession, + simbids_container_ds, +): + """`babs init` renders the zip built-in into code/hooks/ and splices it. + + Render-level check of the ``{builtin: zip}`` form: the shipped + hooks/zip.sh.jinja2 is materialized with the per-hook ``path`` param plus + the derived ``processing_level``, committed to the analysis dataset, and + referenced from participant_job.sh at post_run. + """ + project_base = tmp_path_factory.mktemp('zip_hook_project') + project_root = project_base / 'my_babs_project' + + container_config = update_yaml_for_run( + project_base, + get_config_simbids_path().name, + {'BIDS': bids_data_singlesession}, + ) + with open(container_config) as f: + cfg = yaml.safe_load(f) + cfg['hooks'] = { + 'post_run': [{'builtin': 'zip', 'path': 'outputs/fmriprep_anat-24-1-1'}], + } + with open(container_config, 'w') as f: + yaml.safe_dump(cfg, f) + + babs_init_opts = argparse.Namespace( + project_root=project_root, + list_sub_file=None, + container_ds=simbids_container_ds, + container_name='simbids-0-0-3', + container_config=container_config, + processing_level='subject', + queue='slurm', + keep_if_failed=False, + ) + with mock.patch.object(argparse.ArgumentParser, 'parse_args', return_value=babs_init_opts): + _enter_init() + + analysis = project_root / 'analysis' + zip_sh = analysis / 'code' / 'hooks' / 'zip.sh' + assert zip_sh.exists() + rendered = zip_sh.read_text() + # Subject-level id; archive name from basename(path); zip from inside the + # parent so the archive contains the folder itself; granular removal: + assert 'ZIP_ID="${subid}"' in rendered + assert '${sesid}' not in rendered + assert 'ZIP_NAME="${ZIP_ID}_fmriprep_anat-24-1-1.zip"' in rendered + assert 'cd outputs && 7z a ../${ZIP_NAME} fmriprep_anat-24-1-1' in rendered + assert 'git rm -rf -q --sparse "outputs/fmriprep_anat-24-1-1"' in rendered + # Spliced at post_run only: + participant_job = (analysis / 'code' / 'participant_job.sh').read_text() + assert participant_job.count('bash ./code/hooks/zip.sh') == 1 + # Committed at init: + log = subprocess.run( + ['git', '-C', str(analysis), 'log', '--oneline'], + capture_output=True, + text=True, + check=True, + ).stdout + assert 'Materialize built-in hook scripts' in log + + # TODO remove demo: persist rendered artifacts outside the container so + # they can be inspected after the docker run (the worktree is bind-mounted). + demo_dir = Path(__file__).parent.parent / 'demo-output' + demo_dir.mkdir(exist_ok=True) + (demo_dir / 'zip.sh').write_text(rendered) + (demo_dir / 'participant_job.sh').write_text(participant_job) + (demo_dir / 'analysis-git-log.txt').write_text(log) + # TODO remove demo end + + def test_init_forwards_shared_group(tmp_path): """Test that CLI --shared-group is forwarded to bootstrap.""" options = argparse.Namespace( From 6df5092e5199f18af439422b84577a3b71e76478 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 22:45:20 -0500 Subject: [PATCH 14/16] hooks: validate built-in names and params against a registry BUILTIN_PARAMS maps each built-in to its accepted per-hook params (zip -> {path}). An unknown builtin or a typo'd param (e.g. `paht:`) now fails at resolve time instead of flowing silently into the render context (StrictUndefined only catches missing context, not unexpected keys). The registry also subsumes name validation for built-ins: only known names resolve, so a path-escaping name can't reach the template loader. Also wraps a >99-char line in test_hooks.py that failed ruff. Co-Authored-By: Claude Fable 5 --- babs/hooks.py | 22 ++++++++++++++++++---- tests/test_hooks.py | 19 +++++++++++++++---- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/babs/hooks.py b/babs/hooks.py index 9e789631..b203bfc9 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -45,6 +45,12 @@ # The splice points a hooks config may target, in run order. SPLICE_POINTS = ('pre_run', 'post_run') +# Known built-ins and the per-hook params each accepts. Also the registry of +# valid `{builtin: }` names, so a typo'd name or param fails at +# resolve time (StrictUndefined only catches *missing* template context, not +# unexpected keys, which would otherwise flow in silently). +BUILTIN_PARAMS = {'zip': {'path'}} + @dataclass(frozen=True) class Verbatim: @@ -140,13 +146,21 @@ def _resolve_entry(entry): if isinstance(entry, dict) and 'builtin' in entry: name = entry['builtin'] - _validate_name(name, entry) + if name not in BUILTIN_PARAMS: + raise ValueError( + f'Unknown built-in hook {name!r}; available: {sorted(BUILTIN_PARAMS)}.' + ) + unknown = set(entry) - {'builtin'} - BUILTIN_PARAMS[name] + if unknown: + raise ValueError( + f'Unsupported key(s) {sorted(unknown)} for built-in hook {name!r}; ' + f'it accepts {sorted(BUILTIN_PARAMS[name])}.' + ) # Keys beyond `builtin` are per-hook parameters for the built-in's # template (e.g. zip's optional `path`). They are the *config-derived* # part of the render context; bootstrap merges in the top-level/derived - # part (e.g. `output_dir`, `processing_level`) at render time. The - # template path is loader-relative (PackageLoader('babs', 'templates')); - # an unknown built-in fails as TemplateNotFound at init. + # part (e.g. `processing_level`) at render time. The template path is + # loader-relative (PackageLoader('babs', 'templates')). context = {k: v for k, v in entry.items() if k != 'builtin'} return Render(template_path=f'hooks/{name}.sh.jinja2', name=name, context=context) diff --git a/tests/test_hooks.py b/tests/test_hooks.py index 4c7c7566..c9d5bae3 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -68,7 +68,8 @@ def test_different_sources_same_name_collide(): 'pre_run': [{'script': '/a/validate.sh'}], 'post_run': [{'script': '/b/validate.sh'}], } - with pytest.raises(ValueError, match=r"Duplicate hook name 'validate' \('pre_run' and 'post_run'\)"): + expected = r"Duplicate hook name 'validate' \('pre_run' and 'post_run'\)" + with pytest.raises(ValueError, match=expected): resolve_hooks(cfg) @@ -170,9 +171,19 @@ def test_builtin_same_name_different_context_collides(): resolve_hooks(cfg) -def test_builtin_invalid_name_raises(): - cfg = {'post_run': [{'builtin': '../escape'}]} - with pytest.raises(ValueError, match='Invalid hook name'): +@pytest.mark.parametrize('bad_name', ['../escape', 'gzip', '']) +def test_builtin_unknown_name_raises(bad_name): + # The registry doubles as name validation: only known built-ins resolve, + # so a path-escaping or typo'd name fails at resolve time. + cfg = {'post_run': [{'builtin': bad_name}]} + with pytest.raises(ValueError, match='Unknown built-in hook'): + resolve_hooks(cfg) + + +def test_builtin_unknown_param_raises(): + # A typo'd param must not flow silently into the render context. + cfg = {'post_run': [{'builtin': 'zip', 'paht': 'outputs/x'}]} + with pytest.raises(ValueError, match="Unsupported key.*for built-in hook 'zip'"): resolve_hooks(cfg) From 1043d48debbe24fccc0b286f0611afc11b7e0573 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 9 Jun 2026 23:03:25 -0500 Subject: [PATCH 15/16] single-app: rename _zip.sh -> _run.sh; output_dir as -o source The inner run script stops being named for zipping (zip moves to the post_run hook). generate_submit_script grows an optional `output_dir` (mutually exclusive with the pipeline-only `zip_foldernames`): when given, the datalad run declares the app output folder itself as the output, committing granular results. Single-app still passes zip_foldernames until the config-surface swap lands; pipeline mode is untouched. Co-Authored-By: Claude Fable 5 --- babs/bootstrap.py | 12 ++++++------ babs/check_setup.py | 2 +- babs/generate_submit_script.py | 13 ++++++++++--- babs/templates/participant_job.sh.jinja2 | 7 +++++-- tests/test_base.py | 2 +- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/babs/bootstrap.py b/babs/bootstrap.py index f9186d81..80a9c182 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -423,12 +423,12 @@ def _bootstrap_single_app_scripts( """Bootstrap scripts for single BIDS app configuration.""" container = Container(container_ds, container_name, container_config) - # Generate `_zip.sh`: ---------------------------------- - # which is a bash script of singularity run + zip + # Generate `_run.sh`: ---------------------------------- + # which is a bash script of singularity run # in folder: `analysis/code` - print('\nGenerating a bash script for running container and zipping the outputs...') - print('This bash script will be named as `' + container_name + '_zip.sh`') - bash_path = op.join(self.analysis_path, 'code', container_name + '_zip.sh') + print('\nGenerating a bash script for running the container...') + print('This bash script will be named as `' + container_name + '_run.sh`') + bash_path = op.join(self.analysis_path, 'code', container_name + '_run.sh') shared_group_mode = self.shared_group is not None container.generate_bash_run_bidsapp( bash_path, @@ -437,7 +437,7 @@ def _bootstrap_single_app_scripts( shared_group_mode=shared_group_mode, ) self.datalad_save( - path='code/' + container_name + '_zip.sh', + path='code/' + container_name + '_run.sh', message='Generate script of running container', ) diff --git a/babs/check_setup.py b/babs/check_setup.py index 86bf9dbd..d332865c 100644 --- a/babs/check_setup.py +++ b/babs/check_setup.py @@ -156,7 +156,7 @@ def babs_check_setup(self, submit_a_test_job): else: list_files_code = [ 'babs_proj_config.yaml', - container_name + '_zip.sh', + container_name + '_run.sh', 'participant_job.sh', 'submit_job_template.yaml', ] diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index f6f16517..f88a0f0c 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -25,7 +25,8 @@ def generate_submit_script( input_datasets, processing_level, container_name, - zip_foldernames, + zip_foldernames=None, + output_dir=None, run_script_relpath=None, container_images=None, datalad_run_message=None, @@ -52,8 +53,13 @@ def generate_submit_script( Processing level ('subject' or 'session'). container_name : str Name of the container. - zip_foldernames : dict - Dictionary mapping output names to versions. + zip_foldernames : dict, optional + Dictionary mapping output names to versions. Pipeline mode only: the + ``datalad run`` declares the corresponding per-subject zips as outputs. + output_dir : str, optional + Single-app mode: the app output folder, declared as the ``datalad run`` + output (the run commits granular outputs; zipping is a ``post_run`` + hook). Mutually exclusive with ``zip_foldernames``. run_script_relpath : str, optional Path to script executed by datalad run. None for single-app mode. container_images : list, optional @@ -127,6 +133,7 @@ def generate_submit_script( zip_locator_text=zip_locator_text, container_name=container_name, zip_foldernames=zip_foldernames, + output_dir=output_dir, varname_jobid=varname_jobid, varname_taskid=varname_taskid, input_datasets=input_datasets, diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index 7cd68d70..776b7b02 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -145,7 +145,7 @@ done # datalad run: datalad run \ - -i "{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }}" \ + -i "{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_run.sh' }}" \ {% for input_dataset in input_datasets %} {% if not input_dataset['is_zipped'] %} -i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ @@ -161,13 +161,16 @@ datalad run \ --expand inputs \ {% endif %} --explicit \ +{% if output_dir %} + -o "{{ output_dir }}" \ +{% endif %} {% if zip_foldernames is not none %} {% for key, value in zip_foldernames.items() %} -o "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}_{% raw %}${sesid}{% endraw %}{% endif %}_{{ key }}-{{ value }}.zip" \ {% endfor %} {% endif %} -m "{{ (datalad_run_message if datalad_run_message is defined and datalad_run_message else container_name) }} {% raw %}${subid}{% endraw %}{% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}" \ - "bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}" + "bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_run.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}" {% if hook_post_run %} # post_run hooks: spliced after the run, before push; subshell exports the contract. ( diff --git a/tests/test_base.py b/tests/test_base.py index 22d2ae75..237ba328 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -349,7 +349,7 @@ def test_shared_group_inits_analysis_and_rias( # Shared mode should generate scripts with explicit group mode 0o770. for check_path in ( Path(babs_bootstrap.analysis_path) / 'code' / 'participant_job.sh', - Path(babs_bootstrap.analysis_path) / 'code' / 'simbids-0-0-3_zip.sh', + Path(babs_bootstrap.analysis_path) / 'code' / 'simbids-0-0-3_run.sh', Path(babs_bootstrap.analysis_path) / 'code' / 'check_setup' / 'call_test_job.sh', Path(babs_bootstrap.analysis_path) / 'code' / 'check_setup' / 'test_job.py', ): From aaa5d083abfdb2062ddc03244af7bbf05fa5f69c Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 10 Jun 2026 12:28:31 -0500 Subject: [PATCH 16/16] single-app: replace zip_foldernames/all_results_in_one_zip with output_dir The top-level output_dir (the folder the app writes into, carrying the versioned derivative name) is now the single source for the app write dir, the datalad run -o declaration, and the default zip-hook path. Legacy keys hard-error with a migration message (app_output_settings_from_config stays for pipeline mode; dies in PR 3). - bidsapp_run.sh.jinja2 de-zipped: cmd_zip gone, and the granular outputs survive the script (the run now commits them; the zip hook owns their removal) - the zip built-in becomes a STATIC script taking [] as splice-site arguments (visible in participant_job.sh): path defaults to output_dir at resolve time, name (a free-form archive stem, the old map value's only real job) defaults to basename(path) at runtime, and sesid presence encodes the processing level. All zip instances share one code/hooks/zip.sh; multi-zip differs only in args. The Render seam stays reserved for the container-running built-ins (PR 3). - notebooks/eg_*.yaml + test fixtures migrated; docs: output_dir section replaces zip_foldernames; raw_bids e2e flips post_run to [contract guard, zip] (guard before zip -- ordering is load-bearing) WIP: not yet run in docker (units + e2e pending; docker held by the PR-1 CI session). Co-Authored-By: Claude Fable 5 --- babs/bootstrap.py | 11 +- babs/container.py | 15 +- babs/generate_bidsapp_runscript.py | 8 +- babs/hooks.py | 99 ++++++--- babs/templates/bidsapp_run.sh.jinja2 | 3 +- babs/templates/hooks/zip.sh | 49 +++++ babs/templates/hooks/zip.sh.jinja2 | 43 ---- babs/utils.py | 51 +++++ docs/preparation_config_yaml_file.rst | 204 +++++++----------- docs/walkthrough.rst | 5 +- notebooks/eg_aslprep-0-7-5.yaml | 9 +- notebooks/eg_fmriprep-24-1-1_anatonly.yaml | 9 +- .../eg_fmriprep-24-1-1_ingressed-fs.yaml | 9 +- notebooks/eg_fmriprep-24-1-1_regular.yaml | 16 +- ..._freesurferpost-unstable_ingressed-fs.yaml | 9 +- notebooks/eg_mriqc-24-0-2.yaml | 9 +- notebooks/eg_qsiprep-1-0-0_regular.yaml | 9 +- notebooks/eg_qsirecon-1-0-1_custom_spec.yaml | 10 +- notebooks/eg_qsirecon-1-0-1_hsvs.yaml | 9 +- notebooks/eg_simbids_0-0-3_multiinput.yaml | 9 +- notebooks/eg_simbids_0-0-3_raw_mri.yaml | 9 +- ..._toybidsapp-0-0-7_rawBIDS-walkthrough.yaml | 12 +- notebooks/eg_toybidsapp-0-0-7_zipped.yaml | 12 +- notebooks/eg_xcpd-0-10-6_linc.yaml | 9 +- tests/e2e-slurm/container/config_simbids.yaml | 4 +- .../container/config_simbids_multiinput.yaml | 4 +- .../simbids_fmriprep-24-1-1_anatonly.yaml | 4 +- tests/test_babs_workflow.py | 42 ++-- tests/test_generate_bidsapp_runscript.py | 6 +- tests/test_generate_submit_script.py | 4 +- tests/test_hooks.py | 63 ++++-- tests/test_utils.py | 29 +++ 32 files changed, 479 insertions(+), 305 deletions(-) create mode 100644 babs/templates/hooks/zip.sh delete mode 100644 babs/templates/hooks/zip.sh.jinja2 diff --git a/babs/bootstrap.py b/babs/bootstrap.py index 80a9c182..3fdaacf6 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -19,6 +19,7 @@ from babs.system import System, validate_queue from babs.utils import ( get_datalad_version, + output_dir_from_config, validate_processing_level, ) @@ -275,8 +276,14 @@ def babs_bootstrap( # (Render) hooks are rendered from shipped templates; script (CopyIn) # hooks reuse the imported_files path. Both destinations are relative # to self.analysis_path, so they survive a configurable analysis_path. - # Pipeline configs have no `hooks:` block, so this is a no-op there. - _, _, hook_materializations = resolve_hooks(container.config.get('hooks')) + # Pipeline configs have no `hooks:` block, so this is a no-op there + # (and output_dir -- which would hard-error on a pipeline config's + # legacy zip keys -- is only derived when hooks are configured). + hooks_config = container.config.get('hooks') + _, _, hook_materializations = resolve_hooks( + hooks_config, + output_dir=output_dir_from_config(container.config) if hooks_config else None, + ) self._init_render_hooks([m for m in hook_materializations if isinstance(m, Render)]) copy_ins = [m for m in hook_materializations if isinstance(m, CopyIn)] self._init_import_files( diff --git a/babs/container.py b/babs/container.py index f785cd9f..ee6e199b 100644 --- a/babs/container.py +++ b/babs/container.py @@ -9,7 +9,7 @@ from babs.generate_bidsapp_runscript import generate_bidsapp_runscript from babs.generate_submit_script import generate_submit_script, generate_test_submit_script from babs.hooks import resolve_hooks -from babs.utils import app_output_settings_from_config +from babs.utils import output_dir_from_config class Container: @@ -128,16 +128,12 @@ def generate_bash_run_bidsapp( input_datasets = input_ds.as_records() templateflow_home = os.getenv('TEMPLATEFLOW_HOME') - # What should the outputs look like? - dict_zip_foldernames, bids_app_output_dir = app_output_settings_from_config(self.config) - script_content = generate_bidsapp_runscript( input_datasets, processing_level, container_name=self.container_name, relative_container_path=self.container_path_relToAnalysis, - bids_app_output_dir=bids_app_output_dir, - dict_zip_foldernames=dict_zip_foldernames, + bids_app_output_dir=output_dir_from_config(self.config), bids_app_args=self.config.get('bids_app_args', None), singularity_args=self.config.get('singularity_args', []), templateflow_home=templateflow_home, @@ -179,7 +175,10 @@ def generate_bash_participant_job( If True, align generated script permissions with shared-group mode. """ - hook_pre_run, hook_post_run, _ = resolve_hooks(self.config.get('hooks')) + output_dir = output_dir_from_config(self.config) + hook_pre_run, hook_post_run, _ = resolve_hooks( + self.config.get('hooks'), output_dir=output_dir + ) script_content = generate_submit_script( queue_system=system.type, cluster_resources_config=self.config['cluster_resources'], @@ -188,7 +187,7 @@ def generate_bash_participant_job( input_datasets=input_ds.as_records(), processing_level=processing_level, container_name=self.container_name, - zip_foldernames=self.config['zip_foldernames'], + output_dir=output_dir, project_root=project_root, hook_pre_run=hook_pre_run, hook_post_run=hook_post_run, diff --git a/babs/generate_bidsapp_runscript.py b/babs/generate_bidsapp_runscript.py index a24db5fc..0df018a1 100644 --- a/babs/generate_bidsapp_runscript.py +++ b/babs/generate_bidsapp_runscript.py @@ -14,7 +14,6 @@ def generate_bidsapp_runscript( container_name, relative_container_path, bids_app_output_dir, - dict_zip_foldernames, bids_app_args=None, singularity_args=None, templateflow_home=None, @@ -43,7 +42,7 @@ def generate_bidsapp_runscript( The contents of the bash script that runs the BIDS App singularity image. """ - from .constants import OUTPUT_MAIN_FOLDERNAME, PATH_FS_LICENSE_IN_CONTAINER + from .constants import PATH_FS_LICENSE_IN_CONTAINER # 1. check `bids_app_args` section: if bids_app_args is None: @@ -73,9 +72,6 @@ def generate_bidsapp_runscript( # Get unzip commands for any zipped input datasets cmd_unzip_inputds = get_input_unzipping_cmds(input_datasets) - # Generate zip command - cmd_zip = get_output_zipping_cmds(dict_zip_foldernames, processing_level) - # Render the template env = Environment( loader=PackageLoader('babs', 'templates'), @@ -101,8 +97,6 @@ def generate_bidsapp_runscript( bids_app_input_dir=bids_app_input_dir, bids_app_output_dir=bids_app_output_dir, bids_app_args=bids_app_args, - cmd_zip=cmd_zip, - OUTPUT_MAIN_FOLDERNAME=OUTPUT_MAIN_FOLDERNAME, singularity_flags=singularity_args, subject_selection_flag=subject_selection_flag, ) diff --git a/babs/hooks.py b/babs/hooks.py index b203bfc9..240ccb2e 100644 --- a/babs/hooks.py +++ b/babs/hooks.py @@ -23,11 +23,15 @@ multiple splice points (e.g. a validator at ``pre_run`` and ``post_run``) -- it is copied once and referenced from each. Two *different* sources sharing a basename collide (no name override yet -- add one if needed). -- **built-in** -- ``{builtin: }``; babs renders a shipped - ``templates/hooks/.sh.jinja2`` into ``code/hooks/.sh`` at init - (``Render``). Keys beyond ``builtin`` are per-hook params for the template; - bootstrap merges in the top-level/derived render context (e.g. ``output_dir``). - ``zip`` is the first built-in. +- **built-in** -- ``{builtin: }``; the hook ships inside the babs + package. A *static* built-in (``templates/hooks/.sh``) is copied in + like a script hook (``CopyIn``) and its per-hook params become shell + **arguments** at the splice site, with defaults resolved at config time + (e.g. zip's ``path`` defaults to the top-level ``output_dir``) -- so several + instances share one script and differ only in args. A *templated* built-in + (``templates/hooks/.sh.jinja2``) is rendered at init (``Render``); + none ship yet -- the container-running built-ins (NORDIC) are its first + real consumer. ``zip`` is the first built-in, and it is static. The **container-running templated built-in** (a babs-shipped ``singularity run`` template parameterised by per-hook ``singularity_args``/``bids_app_args``) also @@ -37,19 +41,25 @@ """ import os.path as op +import shlex from dataclasses import dataclass, field # Where copied/rendered hook scripts land inside the analysis dataset. HOOKS_SUBDIR = op.join('code', 'hooks') +# Where the packaged built-in hook files live. +BUILTIN_SOURCE_DIR = op.join(op.dirname(__file__), 'templates', 'hooks') + # The splice points a hooks config may target, in run order. SPLICE_POINTS = ('pre_run', 'post_run') # Known built-ins and the per-hook params each accepts. Also the registry of # valid `{builtin: }` names, so a typo'd name or param fails at -# resolve time (StrictUndefined only catches *missing* template context, not -# unexpected keys, which would otherwise flow in silently). -BUILTIN_PARAMS = {'zip': {'path'}} +# resolve time. STATIC_BUILTINS names the ones shipped as plain `.sh` (copied +# in; params become splice-site arguments); the rest are `.sh.jinja2` +# templates (rendered at init; params become render context) -- none yet. +BUILTIN_PARAMS = {'zip': {'path', 'name'}} +STATIC_BUILTINS = {'zip'} @dataclass(frozen=True) @@ -111,21 +121,46 @@ def _default_name(source): return base[:-3] if base.endswith('.sh') else base -def _resolve_entry(entry): - """Classify a single config entry into a materialization mode. +def _builtin_args(name, entry, output_dir): + """Resolve a static built-in's params into its splice-site arguments. + + zip: `` []``. ``path`` defaults to the config's top-level + ``output_dir`` (the argless common case: archive the whole app output); + ``name`` is only passed when configured (the script defaults it to + ``path``'s basename). + """ + path = entry.get('path', output_dir) + if not path: + raise ValueError( + f'The {name!r} built-in needs a folder to archive: give it a ' + '`path` param or set the top-level `output_dir`.' + ) + return (path,) if 'name' not in entry else (path, entry['name']) + + +def _resolve_entry(entry, output_dir=None): + """Classify a single config entry into a materialization mode + arguments. Parameters ---------- entry : str or dict One item from a ``pre_run:`` / ``post_run:`` list. + output_dir : str or None + The config's top-level ``output_dir``; defaults static built-in args + (zip's ``path``). Returns ------- - Verbatim, CopyIn, or Render + mode : Verbatim, CopyIn, or Render The classified mode. + args : tuple of str + Splice-site arguments for the hook's command (static built-ins only; + empty otherwise). Args belong to the *command*, not the + materialization: several instances of one static built-in share a + single materialized file and differ only in args. """ if isinstance(entry, str): - return Verbatim(command=entry) + return Verbatim(command=entry), () if isinstance(entry, dict) and 'script' in entry: unknown = set(entry) - {'script'} @@ -142,7 +177,7 @@ def _resolve_entry(entry): # `source` is used verbatim as the copy source -- same convention as # `imported_files.original_path` (an absolute local path; resolved by # `_init_import_files`, which raises FileNotFoundError on a bad path). - return CopyIn(original_path=source, name=name) + return CopyIn(original_path=source, name=name), () if isinstance(entry, dict) and 'builtin' in entry: name = entry['builtin'] @@ -156,13 +191,22 @@ def _resolve_entry(entry): f'Unsupported key(s) {sorted(unknown)} for built-in hook {name!r}; ' f'it accepts {sorted(BUILTIN_PARAMS[name])}.' ) - # Keys beyond `builtin` are per-hook parameters for the built-in's - # template (e.g. zip's optional `path`). They are the *config-derived* - # part of the render context; bootstrap merges in the top-level/derived - # part (e.g. `processing_level`) at render time. The template path is - # loader-relative (PackageLoader('babs', 'templates')). + # A static built-in is copied in like a script hook; its params become + # splice-site arguments (resolved here, at config time, so defaults + # like zip's path <- output_dir are visible in participant_job.sh). + if name in STATIC_BUILTINS: + source = op.join(BUILTIN_SOURCE_DIR, f'{name}.sh') + return CopyIn(original_path=source, name=name), _builtin_args( + name, entry, output_dir + ) + # A templated built-in renders at init; keys beyond `builtin` are the + # *config-derived* part of its render context, and bootstrap merges in + # the top-level/derived part (e.g. `processing_level`) at render time. + # The template path is loader-relative (PackageLoader('babs', + # 'templates')). None ship yet (zip is static); the container-running + # built-ins (PR 3) land here. context = {k: v for k, v in entry.items() if k != 'builtin'} - return Render(template_path=f'hooks/{name}.sh.jinja2', name=name, context=context) + return Render(template_path=f'hooks/{name}.sh.jinja2', name=name, context=context), () raise ValueError( f'Unsupported hook entry: {entry!r}. This version supports a raw shell ' @@ -170,16 +214,19 @@ def _resolve_entry(entry): ) -def _command_for(mode): +def _command_for(mode, args=()): """The runtime command string a classified mode contributes to its splice list.""" if isinstance(mode, Verbatim): return mode.command if isinstance(mode, (CopyIn, Render)): - return f'bash ./{mode.analysis_path}' + command = f'bash ./{mode.analysis_path}' + if args: + command += ' ' + ' '.join(shlex.quote(str(arg)) for arg in args) + return command raise TypeError(f'Unknown hook mode: {mode!r}') -def resolve_hooks(hooks_config): +def resolve_hooks(hooks_config, output_dir=None): """Resolve a ``hooks:`` config block into spliceable commands + materializations. Parameters @@ -187,6 +234,10 @@ def resolve_hooks(hooks_config): hooks_config : dict or None The top-level ``hooks:`` block (``{'pre_run': [...], 'post_run': [...]}``), or ``None`` when no hooks are configured. + output_dir : str or None + The config's top-level ``output_dir``; used to default static + built-in arguments (the argless ``{builtin: zip}`` archives it). + Only consulted when such a hook is configured. Returns ------- @@ -226,8 +277,8 @@ def resolve_hooks(hooks_config): for point in SPLICE_POINTS: for entry in hooks_config.get(point) or []: - mode = _resolve_entry(entry) - resolved[point].append(_command_for(mode)) + mode, args = _resolve_entry(entry, output_dir) + resolved[point].append(_command_for(mode, args)) if isinstance(mode, (CopyIn, Render)): # Collision is about the materialized file, not the name alone: # the *same* hook used at multiple splice points (identical diff --git a/babs/templates/bidsapp_run.sh.jinja2 b/babs/templates/bidsapp_run.sh.jinja2 index 12b5da0e..1089bd84 100644 --- a/babs/templates/bidsapp_run.sh.jinja2 +++ b/babs/templates/bidsapp_run.sh.jinja2 @@ -70,8 +70,7 @@ singularity run \ {% endfor %} {{ subject_selection_flag }} "${subid}" -{{ cmd_zip }} -rm -rf {{ OUTPUT_MAIN_FOLDERNAME }} .git/tmp/wkdir +rm -rf .git/tmp/wkdir {% if flag_filterfile %} rm -f "${filterfile}" {% endif %} \ No newline at end of file diff --git a/babs/templates/hooks/zip.sh b/babs/templates/hooks/zip.sh new file mode 100644 index 00000000..7e756a05 --- /dev/null +++ b/babs/templates/hooks/zip.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Built-in babs `zip` hook: archive a BIDS App output folder, commit the +# archive, and remove the granular outputs it replaces. +# +# Copied verbatim at `babs init` into `code/hooks/zip.sh`. Runs at the +# `post_run` splice point (cwd = the job's dataset clone) as a separate +# process: `subid` (and `sesid` at session level) arrive via the exported +# splice-point contract; what to zip arrives as arguments: +# +# zip.sh [] +# +# is the folder to zip, relative to the dataset root. is the +# archive-name stem (the X in ${subid}[_${sesid}]_X.zip), defaulting to +# 's basename. The archive is written to the dataset root and contains +# the folder itself (not its parents), matching the layout of babs zips to +# date. +set -e -u -x + +path="$1" +name="${2:-$(basename "$path")}" +zip_dir="$(dirname "$path")" +zip_folder="$(basename "$path")" + +# subid is exported by the splice-point subshell in participant_job.sh; +# sesid only at session level, so its presence encodes the processing level. +# shellcheck disable=SC2154 +ZIP_ID="${subid}${sesid:+_${sesid}}" +ZIP_NAME="${ZIP_ID}_${name}.zip" + +# Zip real file content, not annex symlinks: +datalad unlock "${path}" + +# cd into the parent so the archive contains the folder at its top level; +# OLDPWD (the dataset root) is where the archive lands. +datalad run \ + --explicit \ + --output "${ZIP_NAME}" \ + -m "Zip ${path} for ${ZIP_ID}" \ + -- \ + "cd ${zip_dir} && 7z a \"\${OLDPWD}/${ZIP_NAME}\" ${zip_folder}" + +# `datalad run --explicit` does not track deletions, so the granular outputs +# are removed in a separate commit (workaround for datalad/datalad#7822, +# since fixed upstream). +# TODO research which datalad version shipped the datalad/datalad#7822 fix; +# once babs's minimum supported datalad is at or above it, fold this removal +# into the datalad run above and drop this step. +git rm -rf -q --sparse "${path}" +git commit -m "Remove ${path} for ${ZIP_ID} (zipped)" diff --git a/babs/templates/hooks/zip.sh.jinja2 b/babs/templates/hooks/zip.sh.jinja2 deleted file mode 100644 index 99685386..00000000 --- a/babs/templates/hooks/zip.sh.jinja2 +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Built-in babs `zip` hook: archive a BIDS App output folder, commit the -# archive, and remove the granular outputs it replaces. -# -# Rendered once at `babs init` into `code/hooks/zip.sh`. Runs at the -# `post_run` splice point (cwd = the job's dataset clone) as a separate -# process: `subid` (and `sesid` at session level) arrive via the exported -# splice-point contract. -set -e -u -x - -{# `path` is the folder to zip, e.g. `outputs/fmriprep_minimal-25-2-5`. #} -{# The archive is written to the dataset root and contains the folder #} -{# itself (not its parents), matching the layout of babs zips to date. #} -{% set zip_folder = path.rsplit('/', 1)[-1] %} -{% set zip_dir = path.rsplit('/', 1)[0] if '/' in path else '.' %} -{% set updirs = '../' * (zip_dir.count('/') + 1) if zip_dir != '.' else '' %} -# subid/sesid are exported by the splice-point subshell in participant_job.sh -# shellcheck disable=SC2154 -{% if processing_level == 'session' %} -ZIP_ID="${subid}_${sesid}" -{% else %} -ZIP_ID="${subid}" -{% endif %} -ZIP_NAME="${ZIP_ID}_{{ zip_folder }}.zip" - -# Zip real file content, not annex symlinks: -datalad unlock "{{ path }}" - -datalad run \ - --explicit \ - --output "${ZIP_NAME}" \ - -m "Zip {{ path }} for ${ZIP_ID}" \ - -- \ - "cd {{ zip_dir }} && 7z a {{ updirs }}${ZIP_NAME} {{ zip_folder }}" - -# `datalad run --explicit` does not track deletions, so the granular outputs -# are removed in a separate commit (workaround for datalad/datalad#7822, -# since fixed upstream). -# TODO research which datalad version shipped the datalad/datalad#7822 fix; -# once babs's minimum supported datalad is at or above it, fold this removal -# into the datalad run above and drop this step. -git rm -rf -q --sparse "{{ path }}" -git commit -m "Remove {{ path }} for ${ZIP_ID} (zipped)" diff --git a/babs/utils.py b/babs/utils.py index b4fc01ab..13b929e3 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -238,6 +238,57 @@ def app_output_settings_from_config(config): return config['zip_foldernames'], bids_app_output_dir +def output_dir_from_config(config): + """ + Get the BIDS App output directory from the top-level `output_dir` key. + + Single-app mode only; pipeline mode still derives its folders from + `zip_foldernames` (via `app_output_settings_from_config`) and goes away + with pipeline mode. `output_dir` is the folder the BIDS App writes into, + relative to the dataset root, carrying the full versioned derivative name + (e.g. `outputs/fmriprep-24-1-1`). It is the single source for the app + write dir, the `datalad run` output declaration, and the default folder + the built-in `zip` hook archives. + + Parameters + ---------- + config: dictionary + attribute `config` in class Container; + + Returns + ------- + output_dir: str + the normalized `output_dir` value. + + Raises + ------ + ValueError + if the removed `zip_foldernames` / `all_results_in_one_zip` keys are + present, or `output_dir` is missing or not a relative in-dataset path. + """ + legacy = [key for key in ('zip_foldernames', 'all_results_in_one_zip') if key in config] + if legacy: + raise ValueError( + f'Config key(s) {legacy} are no longer supported. Declare the BIDS App' + " output folder with a top-level `output_dir` (e.g. 'outputs/fmriprep-24-1-1')" + ' and configure zipping as a post_run hook:' + ' `hooks: {post_run: [{builtin: zip}]}`.' + ) + output_dir = config.get('output_dir') + if not output_dir or not isinstance(output_dir, str): + raise ValueError( + 'The container config needs a top-level `output_dir`: the folder the' + ' BIDS App writes into, relative to the dataset root' + " (e.g. 'outputs/fmriprep-24-1-1')." + ) + normalized = os.path.normpath(output_dir) + if os.path.isabs(normalized) or normalized.startswith('..') or normalized == '.': + raise ValueError( + f'`output_dir` must be a relative path inside the dataset, got {output_dir!r}.' + ) + return normalized + + def get_username(): """ Get the current username. diff --git a/docs/preparation_config_yaml_file.rst b/docs/preparation_config_yaml_file.rst index eea66cfc..f8ba77cf 100644 --- a/docs/preparation_config_yaml_file.rst +++ b/docs/preparation_config_yaml_file.rst @@ -26,8 +26,7 @@ Sections in the configuration YAML file * **singularity_args**: the arguments for ``singularity run``; * **bids_app_args**: the arguments for the BIDS App; * **imported_files**: the files to be copied into the datalad dataset; -* **all_results_in_one_zip**: whether to zip all results in one zip file; -* **zip_foldernames**: the results foldername(s) to be zipped; +* **output_dir**: the folder the BIDS App writes its results into; * **required_files**: to only keep subjects (sessions) that have this list of required files in input dataset(s); * **alert_log_messages**: alert messages in the log files that may be helpful for debugging errors in failed jobs; @@ -162,6 +161,8 @@ It also means that the ``recon_spec.yaml`` file will be tracked by datalad. **Important**: If you are importing a large file this mechanism will not work. +.. _section-hooks: + Section ``hooks`` ================= @@ -186,7 +187,6 @@ Example section **hooks** post_run: - script: "/path/to/validate-outputs.sh" - builtin: zip # a built-in shipped with BABS - path: outputs/fmriprep_minimal-25-2-5 Three entry forms are supported: @@ -197,22 +197,33 @@ Three entry forms are supported: (copied into the project the same way as ``imported_files``). BABS copies it to ``code/hooks/.sh`` at ``babs init`` and the splice runs ``bash ./code/hooks/.sh`` — a **separate process**. -- **built-in** — ``{builtin: }``. The hook ships with BABS as a template; - ``babs init`` renders it into ``code/hooks/.sh`` (git-tracked, so you can - read exactly what will run) and the splice runs it like a script hook. Keys - beyond ``builtin`` are parameters for that built-in. +- **built-in** — ``{builtin: }``. The hook ships with BABS; ``babs init`` + copies it into ``code/hooks/.sh`` (git-tracked, so you can read exactly + what will run) and the splice runs it like a script hook. Keys beyond + ``builtin`` are parameters for that built-in, passed as **arguments** at the + splice site — several instances of the same built-in (e.g. several zip hooks) + share one script and differ only in their arguments. Built-in: ``zip`` ----------------- -``{builtin: zip, path: }`` archives one output folder as a ``post_run`` -hook. ``path`` is the folder to zip, relative to the dataset root (e.g. -``outputs/fmriprep_minimal-25-2-5``). The hook zips it into -``${subid}[_${sesid}]_.zip`` at the dataset root inside its **own** -``datalad run`` (so the archive is committed with provenance), then removes the -granular folder in a follow-up commit. The archive contains the folder itself -(e.g. ``fmriprep_minimal-25-2-5/``) at its top level. To produce several -separate archives, list several zip hooks, each with its own ``path``. +``{builtin: zip}`` archives one output folder as a ``post_run`` hook. Its two +optional parameters: + +- ``path`` — the folder to zip, relative to the dataset root. Defaults to the + top-level ``output_dir``, so the argless form covers the common case of + archiving the whole app output. +- ``name`` — the archive-name stem: the hook zips ``path`` into + ``${subid}[_${sesid}]_.zip`` at the dataset root. Defaults to + ``path``'s basename. Set it when the BIDS App controls the folder name and + you want a versioned archive name (e.g. ``path: outputs/freesurfer`` with + ``name: freesurfer-24-1-1``). + +The hook zips inside its **own** ``datalad run`` (so the archive is committed +with provenance), then removes the granular folder in a follow-up commit. The +archive contains the folder itself (e.g. ``fmriprep_minimal-25-2-5/``) at its +top level. To produce several separate archives, list several zip hooks, each +with its own ``path``. The runtime contract -------------------- @@ -487,139 +498,86 @@ Advanced - Manual of writing section ``bids_app_args`` .. `notebooks/inDev_*.yaml` in `babs_tests` repo: done -Section ``zip_foldernames`` -=========================== - -This section defines the name(s) of the expected output folder(s). -BABS will zip those folder(s) into separate zip file(s). - -Here we provide two examples. :ref:`Example #1 ` -is for regular use cases, -where the BIDS App will generate one or several folders that wrap all derivative files. -Example use cases are ``fMRIPrep`` with legacy output layout, as well as ``QSIPrep`` and ``XCP-D``. - -If the BIDS App won't generate one or several folders that wrap all derivative files, -users should ask BABS to create a folder as an extra layer by specifying ``all_results_in_one_zip: true``. -We explain how to do so in :ref:`Example #2 `. -An example use case is ``fMRIPrep`` with BIDS output layout. - +Section ``output_dir`` +====================== -.. _example_zip_foldernames_for_fmriprep_legacy_output_layout: +``output_dir`` is the folder the BIDS App writes its results into, relative to +the dataset root. It carries the full versioned derivative name (e.g. +``outputs/fmriprep-24-1-1``) and is the single source for that name: the same +string is the app's write directory, the ``datalad run`` output declaration, +and the default folder the built-in :ref:`zip hook ` archives. -Example #1: for ``fMRIPrep`` *legacy* output layout ---------------------------------------------------- +Note that ``output_dir`` only *declares* the folder and commits whatever the +app writes there -- zipping is opt-in, configured as a ``post_run`` zip hook +(see Section ``hooks`` above). No zip hook = results are pushed as granular +files. -Here we use ``fMRIPrep`` (*legacy* output layout) as an example to show you -how to write this ``zip_foldernames`` section. For this case, all derivative files -are wrapped in folders generated by fMRIPrep. Similar use cases are ``QSIPrep`` -(e.g., generating a folder called ``qsiprep``), and ``XCP-D`` (generating a folder called ``xcp_d``). +Which folder to point it at depends on the BIDS App's output layout: -Older versions of ``fMRIPrep`` (version < 21.0) generate -`legacy output layout `_ -which looks like below:: +Example #1: the BIDS App writes directly into the output directory +------------------------------------------------------------------- - / - fmriprep/ - freesurfer/ - -In this case, ``fMRIPrep`` generates two folders, ``fmriprep`` and ``freesurfer``, -which include all derivatives. Therefore, we can directly tell BABS the expected foldernames, -without asking BABS to create them. - -Example section **zip_foldernames** for ``fMRIPrep`` *legacy* output layout: +Recent ``fMRIPrep`` (version >= 21.0) uses +`BIDS output layout `_: +it writes files like ``sub-