diff --git a/babs/bootstrap.py b/babs/bootstrap.py index 3827bff6..3fdaacf6 100644 --- a/babs/bootstrap.py +++ b/babs/bootstrap.py @@ -13,11 +13,13 @@ from babs.base import BABS from babs.container import Container +from babs.hooks import CopyIn, Render, resolve_hooks from babs.input_datasets import InputDatasets from babs.status import create_initial_statuses, write_job_status_csv from babs.system import System, validate_queue from babs.utils import ( get_datalad_version, + output_dir_from_config, validate_processing_level, ) @@ -270,8 +272,24 @@ def babs_bootstrap( ) container = Container(container_ds, container_name, container_config) - # Copy in any other files needed: - self._init_import_files(container.config.get('imported_files', [])) + # Materialize hooks (and copy in any other files needed). Built-in + # (Render) hooks are rendered from shipped templates; script (CopyIn) + # hooks reuse the imported_files path. Both destinations are relative + # to self.analysis_path, so they survive a configurable analysis_path. + # Pipeline configs have no `hooks:` block, so this is a no-op there + # (and output_dir -- which would hard-error on a pipeline config's + # legacy zip keys -- is only derived when hooks are configured). + hooks_config = container.config.get('hooks') + _, _, hook_materializations = resolve_hooks( + hooks_config, + output_dir=output_dir_from_config(container.config) if hooks_config else None, + ) + self._init_render_hooks([m for m in hook_materializations if isinstance(m, Render)]) + copy_ins = [m for m in hook_materializations if isinstance(m, CopyIn)] + self._init_import_files( + container.config.get('imported_files', []) + + [m.as_import() for m in copy_ins] + ) # _update_inclusion_dataframe() expects a DataFrame (or None). # If --list_sub_file was provided, use the parsed DataFrame # stored in initial_inclu_df by set_inclusion_dataframe() above. @@ -412,12 +430,12 @@ def _bootstrap_single_app_scripts( """Bootstrap scripts for single BIDS app configuration.""" container = Container(container_ds, container_name, container_config) - # Generate `_zip.sh`: ---------------------------------- - # which is a bash script of singularity run + zip + # Generate `_run.sh`: ---------------------------------- + # which is a bash script of singularity run # in folder: `analysis/code` - print('\nGenerating a bash script for running container and zipping the outputs...') - print('This bash script will be named as `' + container_name + '_zip.sh`') - bash_path = op.join(self.analysis_path, 'code', container_name + '_zip.sh') + print('\nGenerating a bash script for running the container...') + print('This bash script will be named as `' + container_name + '_run.sh`') + bash_path = op.join(self.analysis_path, 'code', container_name + '_run.sh') shared_group_mode = self.shared_group is not None container.generate_bash_run_bidsapp( bash_path, @@ -426,7 +444,7 @@ def _bootstrap_single_app_scripts( shared_group_mode=shared_group_mode, ) self.datalad_save( - path='code/' + container_name + '_zip.sh', + path='code/' + container_name + '_run.sh', message='Generate script of running container', ) @@ -564,6 +582,9 @@ def _init_import_files(self, file_list): f'Requested imported file {imported_file["original_path"]} does not exist.' ) imported_location = op.join(self.analysis_path, imported_file['analysis_path']) + # Create the destination's parent dir if needed (e.g. hooks land in + # `code/hooks/`, which doesn't pre-exist like flat `code/` does). + os.makedirs(op.dirname(imported_location), exist_ok=True) # Copy the file using pure Python: with ( open(imported_file['original_path'], 'rb') as src, @@ -582,6 +603,42 @@ def _init_import_files(self, file_list): message='Import files', ) + def _init_render_hooks(self, renders): + """ + Render built-in hook templates into ``code/hooks/`` and datalad save. + + Parameters + ---------- + renders: list of babs.hooks.Render + Built-in hooks to materialize. Each is rendered from its shipped + template (``babs/templates/hooks/``) with a context of the per-hook + config params plus the derived part supplied here + (``processing_level``). + """ + if not renders: + return + env = Environment( + loader=PackageLoader('babs', 'templates'), + trim_blocks=True, + lstrip_blocks=True, + autoescape=False, + undefined=StrictUndefined, + ) + rendered_files = [] + for render in renders: + # Derived values win; a per-hook config param can't override them. + context = {**render.context, 'processing_level': self.processing_level} + content = env.get_template(render.template_path).render(**context) + destination = op.join(self.analysis_path, render.analysis_path) + os.makedirs(op.dirname(destination), exist_ok=True) + with open(destination, 'w') as f: + f.write(content) + rendered_files.append(render.analysis_path) + self.datalad_save( + path=rendered_files, + message='Materialize built-in hook scripts', + ) + def clean_up(self): """ If `babs init` failed, this function cleans up the BABS project `babs init` creates. diff --git a/babs/check_setup.py b/babs/check_setup.py index 86bf9dbd..d332865c 100644 --- a/babs/check_setup.py +++ b/babs/check_setup.py @@ -156,7 +156,7 @@ def babs_check_setup(self, submit_a_test_job): else: list_files_code = [ 'babs_proj_config.yaml', - container_name + '_zip.sh', + container_name + '_run.sh', 'participant_job.sh', 'submit_job_template.yaml', ] diff --git a/babs/container.py b/babs/container.py index 4424b833..ee6e199b 100644 --- a/babs/container.py +++ b/babs/container.py @@ -8,7 +8,8 @@ from babs.generate_bidsapp_runscript import generate_bidsapp_runscript from babs.generate_submit_script import generate_submit_script, generate_test_submit_script -from babs.utils import app_output_settings_from_config +from babs.hooks import resolve_hooks +from babs.utils import output_dir_from_config class Container: @@ -127,16 +128,12 @@ def generate_bash_run_bidsapp( input_datasets = input_ds.as_records() templateflow_home = os.getenv('TEMPLATEFLOW_HOME') - # What should the outputs look like? - dict_zip_foldernames, bids_app_output_dir = app_output_settings_from_config(self.config) - script_content = generate_bidsapp_runscript( input_datasets, processing_level, container_name=self.container_name, relative_container_path=self.container_path_relToAnalysis, - bids_app_output_dir=bids_app_output_dir, - dict_zip_foldernames=dict_zip_foldernames, + bids_app_output_dir=output_dir_from_config(self.config), bids_app_args=self.config.get('bids_app_args', None), singularity_args=self.config.get('singularity_args', []), templateflow_home=templateflow_home, @@ -178,6 +175,10 @@ def generate_bash_participant_job( If True, align generated script permissions with shared-group mode. """ + output_dir = output_dir_from_config(self.config) + hook_pre_run, hook_post_run, _ = resolve_hooks( + self.config.get('hooks'), output_dir=output_dir + ) script_content = generate_submit_script( queue_system=system.type, cluster_resources_config=self.config['cluster_resources'], @@ -186,8 +187,10 @@ def generate_bash_participant_job( input_datasets=input_ds.as_records(), processing_level=processing_level, container_name=self.container_name, - zip_foldernames=self.config['zip_foldernames'], + output_dir=output_dir, project_root=project_root, + hook_pre_run=hook_pre_run, + hook_post_run=hook_post_run, ) with open(bash_path, 'w') as f: diff --git a/babs/generate_bidsapp_runscript.py b/babs/generate_bidsapp_runscript.py index a24db5fc..0df018a1 100644 --- a/babs/generate_bidsapp_runscript.py +++ b/babs/generate_bidsapp_runscript.py @@ -14,7 +14,6 @@ def generate_bidsapp_runscript( container_name, relative_container_path, bids_app_output_dir, - dict_zip_foldernames, bids_app_args=None, singularity_args=None, templateflow_home=None, @@ -43,7 +42,7 @@ def generate_bidsapp_runscript( The contents of the bash script that runs the BIDS App singularity image. """ - from .constants import OUTPUT_MAIN_FOLDERNAME, PATH_FS_LICENSE_IN_CONTAINER + from .constants import PATH_FS_LICENSE_IN_CONTAINER # 1. check `bids_app_args` section: if bids_app_args is None: @@ -73,9 +72,6 @@ def generate_bidsapp_runscript( # Get unzip commands for any zipped input datasets cmd_unzip_inputds = get_input_unzipping_cmds(input_datasets) - # Generate zip command - cmd_zip = get_output_zipping_cmds(dict_zip_foldernames, processing_level) - # Render the template env = Environment( loader=PackageLoader('babs', 'templates'), @@ -101,8 +97,6 @@ def generate_bidsapp_runscript( bids_app_input_dir=bids_app_input_dir, bids_app_output_dir=bids_app_output_dir, bids_app_args=bids_app_args, - cmd_zip=cmd_zip, - OUTPUT_MAIN_FOLDERNAME=OUTPUT_MAIN_FOLDERNAME, singularity_flags=singularity_args, subject_selection_flag=subject_selection_flag, ) diff --git a/babs/generate_submit_script.py b/babs/generate_submit_script.py index 8242a4e2..f88a0f0c 100644 --- a/babs/generate_submit_script.py +++ b/babs/generate_submit_script.py @@ -25,11 +25,14 @@ def generate_submit_script( input_datasets, processing_level, container_name, - zip_foldernames, + zip_foldernames=None, + output_dir=None, run_script_relpath=None, container_images=None, datalad_run_message=None, project_root=None, + hook_pre_run=None, + hook_post_run=None, ): """ Generate a bash script that runs the BIDS App singularity image. @@ -50,8 +53,13 @@ def generate_submit_script( Processing level ('subject' or 'session'). container_name : str Name of the container. - zip_foldernames : dict - Dictionary mapping output names to versions. + zip_foldernames : dict, optional + Dictionary mapping output names to versions. Pipeline mode only: the + ``datalad run`` declares the corresponding per-subject zips as outputs. + output_dir : str, optional + Single-app mode: the app output folder, declared as the ``datalad run`` + output (the run commits granular outputs; zipping is a ``post_run`` + hook). Mutually exclusive with ``zip_foldernames``. run_script_relpath : str, optional Path to script executed by datalad run. None for single-app mode. container_images : list, optional @@ -62,6 +70,12 @@ def generate_submit_script( Absolute path to the BABS project root (parent of `analysis/`). Passed to the template; used in the error message when PROJECT_ROOT is unset. If None, the placeholder ``{project_root}`` is shown. + hook_pre_run : list of str, optional + Shell snippets spliced into a subshell just before the ``datalad run`` + wrapper. None (or empty) renders nothing. Snippets are emitted in order. + hook_post_run : list of str, optional + Shell snippets spliced into a subshell just after the ``datalad run`` + wrapper, before the push. None (or empty) renders nothing. Returns ------- @@ -119,6 +133,7 @@ def generate_submit_script( zip_locator_text=zip_locator_text, container_name=container_name, zip_foldernames=zip_foldernames, + output_dir=output_dir, varname_jobid=varname_jobid, varname_taskid=varname_taskid, input_datasets=input_datasets, @@ -129,6 +144,8 @@ def generate_submit_script( container_image_paths=container_image_paths, datalad_run_message=datalad_run_message, project_root=project_root, + hook_pre_run=hook_pre_run, + hook_post_run=hook_post_run, ) diff --git a/babs/hooks.py b/babs/hooks.py new file mode 100644 index 00000000..240ccb2e --- /dev/null +++ b/babs/hooks.py @@ -0,0 +1,304 @@ +"""Resolve a `hooks:` config block into spliceable commands + materializations. + +A BABS container config may carry a top-level ``hooks:`` block with +``pre_run:`` / ``post_run:`` lists. Each entry names a snippet of work to run +at the corresponding splice point in ``participant_job.sh`` (added in the +splice-points step). This module turns those config entries into: + +- the **runtime command strings** spliced into the job script, and +- the **file materializations** BABS must perform at ``babs init`` (copying a + user script into ``code/hooks/.sh``). + +`resolve_hooks` is **pure** (config in -> commands + descriptors out, no I/O); +the caller (`Bootstrap`) performs the materializations. + +Entry forms supported in this version: + +- **snippet** -- a bare string, spliced verbatim (``Verbatim``). +- **script** -- ``{script: }``; copied into + ``code/hooks/.sh`` and invoked as ``bash ./code/hooks/.sh`` + (``CopyIn``). ```` is an absolute local path used verbatim -- the same + convention as ``imported_files.original_path``, which this reuses. The + destination name is the source basename. The *same* script may appear at + multiple splice points (e.g. a validator at ``pre_run`` and ``post_run``) -- + it is copied once and referenced from each. Two *different* sources sharing a + basename collide (no name override yet -- add one if needed). +- **built-in** -- ``{builtin: }``; the hook ships inside the babs + package. A *static* built-in (``templates/hooks/.sh``) is copied in + like a script hook (``CopyIn``) and its per-hook params become shell + **arguments** at the splice site, with defaults resolved at config time + (e.g. zip's ``path`` defaults to the top-level ``output_dir``) -- so several + instances share one script and differ only in args. A *templated* built-in + (``templates/hooks/.sh.jinja2``) is rendered at init (``Render``); + none ship yet -- the container-running built-ins (NORDIC) are its first + real consumer. ``zip`` is the first built-in, and it is static. + +The **container-running templated built-in** (a babs-shipped ``singularity run`` +template parameterised by per-hook ``singularity_args``/``bids_app_args``) also +lands as a ``Render``, but is still deferred -- ``{container: ...}`` entries are +rejected for now. User-provided jinja templates are intentionally *not* +supported: a template is inert without babs code to populate its context. +""" + +import os.path as op +import shlex +from dataclasses import dataclass, field + +# Where copied/rendered hook scripts land inside the analysis dataset. +HOOKS_SUBDIR = op.join('code', 'hooks') + +# Where the packaged built-in hook files live. +BUILTIN_SOURCE_DIR = op.join(op.dirname(__file__), 'templates', 'hooks') + +# The splice points a hooks config may target, in run order. +SPLICE_POINTS = ('pre_run', 'post_run') + +# Known built-ins and the per-hook params each accepts. Also the registry of +# valid `{builtin: }` names, so a typo'd name or param fails at +# resolve time. STATIC_BUILTINS names the ones shipped as plain `.sh` (copied +# in; params become splice-site arguments); the rest are `.sh.jinja2` +# templates (rendered at init; params become render context) -- none yet. +BUILTIN_PARAMS = {'zip': {'path', 'name'}} +STATIC_BUILTINS = {'zip'} + + +@dataclass(frozen=True) +class Verbatim: + """Snippet hook: a raw shell snippet spliced inline. No file materialization.""" + + command: str + + +@dataclass(frozen=True) +class CopyIn: + """Script hook (or static built-in): copy ``original_path`` -> ``code/hooks/.sh``.""" + + original_path: str + name: str + + @property + def analysis_path(self): + """Destination relative to the analysis dataset root.""" + return op.join(HOOKS_SUBDIR, f'{self.name}.sh') + + def as_import(self): + """Shape consumed by ``Bootstrap._init_import_files``.""" + return {'original_path': self.original_path, 'analysis_path': self.analysis_path} + + +@dataclass(frozen=True) +class Render: + """Templated built-in: render ``template_path`` -> ``code/hooks/.sh``. + + ``template_path`` is loader-relative (``PackageLoader('babs', 'templates')``). + ``context`` is the config-derived part (per-hook params); bootstrap merges in + the top-level/derived part (e.g. ``output_dir``) before rendering. Produced for + ``{builtin: }`` entries; the ``{container: ...}`` scaffold is still deferred. + """ + + template_path: str + name: str + context: dict = field(default_factory=dict) + + @property + def analysis_path(self): + """Destination relative to the analysis dataset root.""" + return op.join(HOOKS_SUBDIR, f'{self.name}.sh') + + +def _validate_name(name, entry): + """Reject a hook name that would escape ``code/hooks/`` or is empty.""" + if not name or name in ('.', '..') or op.basename(name) != name: + raise ValueError( + f'Invalid hook name {name!r} from entry {entry!r}: a hook name must be ' + 'a bare filename (no path separators, not "." or "..").' + ) + + +def _default_name(source): + """Derive a hook name from a source path: basename with a trailing ``.sh`` dropped.""" + base = op.basename(source) + return base[:-3] if base.endswith('.sh') else base + + +def _builtin_args(name, entry, output_dir): + """Resolve a static built-in's params into its splice-site arguments. + + zip: `` []``. ``path`` defaults to the config's top-level + ``output_dir`` (the argless common case: archive the whole app output); + ``name`` is only passed when configured (the script defaults it to + ``path``'s basename). + """ + path = entry.get('path', output_dir) + if not path: + raise ValueError( + f'The {name!r} built-in needs a folder to archive: give it a ' + '`path` param or set the top-level `output_dir`.' + ) + return (path,) if 'name' not in entry else (path, entry['name']) + + +def _resolve_entry(entry, output_dir=None): + """Classify a single config entry into a materialization mode + arguments. + + Parameters + ---------- + entry : str or dict + One item from a ``pre_run:`` / ``post_run:`` list. + output_dir : str or None + The config's top-level ``output_dir``; defaults static built-in args + (zip's ``path``). + + Returns + ------- + mode : Verbatim, CopyIn, or Render + The classified mode. + args : tuple of str + Splice-site arguments for the hook's command (static built-ins only; + empty otherwise). Args belong to the *command*, not the + materialization: several instances of one static built-in share a + single materialized file and differ only in args. + """ + if isinstance(entry, str): + return Verbatim(command=entry), () + + if isinstance(entry, dict) and 'script' in entry: + unknown = set(entry) - {'script'} + if unknown: + raise ValueError( + f'Unsupported key(s) {sorted(unknown)} in script hook entry {entry!r}; ' + 'expected only "script".' + ) + source = entry['script'] + # The destination name is derived from the source basename; there is no + # override yet, so two sources with the same basename collide (see below). + name = _default_name(source) + _validate_name(name, entry) + # `source` is used verbatim as the copy source -- same convention as + # `imported_files.original_path` (an absolute local path; resolved by + # `_init_import_files`, which raises FileNotFoundError on a bad path). + return CopyIn(original_path=source, name=name), () + + if isinstance(entry, dict) and 'builtin' in entry: + name = entry['builtin'] + if name not in BUILTIN_PARAMS: + raise ValueError( + f'Unknown built-in hook {name!r}; available: {sorted(BUILTIN_PARAMS)}.' + ) + unknown = set(entry) - {'builtin'} - BUILTIN_PARAMS[name] + if unknown: + raise ValueError( + f'Unsupported key(s) {sorted(unknown)} for built-in hook {name!r}; ' + f'it accepts {sorted(BUILTIN_PARAMS[name])}.' + ) + # A static built-in is copied in like a script hook; its params become + # splice-site arguments (resolved here, at config time, so defaults + # like zip's path <- output_dir are visible in participant_job.sh). + if name in STATIC_BUILTINS: + source = op.join(BUILTIN_SOURCE_DIR, f'{name}.sh') + return CopyIn(original_path=source, name=name), _builtin_args( + name, entry, output_dir + ) + # A templated built-in renders at init; keys beyond `builtin` are the + # *config-derived* part of its render context, and bootstrap merges in + # the top-level/derived part (e.g. `processing_level`) at render time. + # The template path is loader-relative (PackageLoader('babs', + # 'templates')). None ship yet (zip is static); the container-running + # built-ins (PR 3) land here. + context = {k: v for k, v in entry.items() if k != 'builtin'} + return Render(template_path=f'hooks/{name}.sh.jinja2', name=name, context=context), () + + raise ValueError( + f'Unsupported hook entry: {entry!r}. This version supports a raw shell ' + 'string, a {script: } mapping, or a {builtin: } mapping.' + ) + + +def _command_for(mode, args=()): + """The runtime command string a classified mode contributes to its splice list.""" + if isinstance(mode, Verbatim): + return mode.command + if isinstance(mode, (CopyIn, Render)): + command = f'bash ./{mode.analysis_path}' + if args: + command += ' ' + ' '.join(shlex.quote(str(arg)) for arg in args) + return command + raise TypeError(f'Unknown hook mode: {mode!r}') + + +def resolve_hooks(hooks_config, output_dir=None): + """Resolve a ``hooks:`` config block into spliceable commands + materializations. + + Parameters + ---------- + hooks_config : dict or None + The top-level ``hooks:`` block (``{'pre_run': [...], 'post_run': [...]}``), + or ``None`` when no hooks are configured. + output_dir : str or None + The config's top-level ``output_dir``; used to default static + built-in arguments (the argless ``{builtin: zip}`` archives it). + Only consulted when such a hook is configured. + + Returns + ------- + pre_run : list of str + Command strings to splice at the ``pre_run`` point, in order. + post_run : list of str + Command strings to splice at the ``post_run`` point, in order. + materializations : list of (CopyIn or Render) + Files BABS must copy/render into ``code/hooks/`` at ``babs init``. + `Verbatim` entries contribute no materialization. + + Raises + ------ + ValueError + On an unknown splice-point key, an unsupported entry form, an invalid + hook name, or two *different* hooks resolving to the same name in the + (shared) ``code/hooks/`` namespace. The identical hook reused at + multiple splice points is allowed (materialized once). + """ + if hooks_config is None: + return [], [], [] + if not isinstance(hooks_config, dict): + raise ValueError( + f'`hooks:` must be a mapping with keys {list(SPLICE_POINTS)}, ' + f'got {hooks_config!r}.' + ) + unknown_points = set(hooks_config) - set(SPLICE_POINTS) + if unknown_points: + raise ValueError( + f'Unknown hook splice point(s) {sorted(unknown_points)}; ' + f'expected only {list(SPLICE_POINTS)}.' + ) + + resolved = {point: [] for point in SPLICE_POINTS} + materializations = [] + seen = {} # name -> (descriptor, splice point that first claimed it) + + for point in SPLICE_POINTS: + for entry in hooks_config.get(point) or []: + mode, args = _resolve_entry(entry, output_dir) + resolved[point].append(_command_for(mode, args)) + if isinstance(mode, (CopyIn, Render)): + # Collision is about the materialized file, not the name alone: + # the *same* hook used at multiple splice points (identical + # descriptor) is fine -- copy/render once, reference it from + # each list. Only a *different* descriptor claiming the same + # name is a real conflict (one would clobber the other). For a + # Render this includes a differing `context` -- same template + # rendered two ways into one file collides. + if mode.name in seen: + prior, prior_point = seen[mode.name] + if mode == prior: + continue # same file; already materialized + where = ( + repr(point) if prior_point == point else f'{prior_point!r} and {point!r}' + ) + raise ValueError( + f'Duplicate hook name {mode.name!r} ({where}): two different ' + f'hooks both resolve to {op.join(HOOKS_SUBDIR, mode.name + ".sh")!r}.' + ) + seen[mode.name] = (mode, point) + materializations.append(mode) + + return resolved['pre_run'], resolved['post_run'], materializations diff --git a/babs/templates/bidsapp_run.sh.jinja2 b/babs/templates/bidsapp_run.sh.jinja2 index 12b5da0e..1089bd84 100644 --- a/babs/templates/bidsapp_run.sh.jinja2 +++ b/babs/templates/bidsapp_run.sh.jinja2 @@ -70,8 +70,7 @@ singularity run \ {% endfor %} {{ subject_selection_flag }} "${subid}" -{{ cmd_zip }} -rm -rf {{ OUTPUT_MAIN_FOLDERNAME }} .git/tmp/wkdir +rm -rf .git/tmp/wkdir {% if flag_filterfile %} rm -f "${filterfile}" {% endif %} \ No newline at end of file diff --git a/babs/templates/hooks/zip.sh b/babs/templates/hooks/zip.sh new file mode 100644 index 00000000..7e756a05 --- /dev/null +++ b/babs/templates/hooks/zip.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Built-in babs `zip` hook: archive a BIDS App output folder, commit the +# archive, and remove the granular outputs it replaces. +# +# Copied verbatim at `babs init` into `code/hooks/zip.sh`. Runs at the +# `post_run` splice point (cwd = the job's dataset clone) as a separate +# process: `subid` (and `sesid` at session level) arrive via the exported +# splice-point contract; what to zip arrives as arguments: +# +# zip.sh [] +# +# is the folder to zip, relative to the dataset root. is the +# archive-name stem (the X in ${subid}[_${sesid}]_X.zip), defaulting to +# 's basename. The archive is written to the dataset root and contains +# the folder itself (not its parents), matching the layout of babs zips to +# date. +set -e -u -x + +path="$1" +name="${2:-$(basename "$path")}" +zip_dir="$(dirname "$path")" +zip_folder="$(basename "$path")" + +# subid is exported by the splice-point subshell in participant_job.sh; +# sesid only at session level, so its presence encodes the processing level. +# shellcheck disable=SC2154 +ZIP_ID="${subid}${sesid:+_${sesid}}" +ZIP_NAME="${ZIP_ID}_${name}.zip" + +# Zip real file content, not annex symlinks: +datalad unlock "${path}" + +# cd into the parent so the archive contains the folder at its top level; +# OLDPWD (the dataset root) is where the archive lands. +datalad run \ + --explicit \ + --output "${ZIP_NAME}" \ + -m "Zip ${path} for ${ZIP_ID}" \ + -- \ + "cd ${zip_dir} && 7z a \"\${OLDPWD}/${ZIP_NAME}\" ${zip_folder}" + +# `datalad run --explicit` does not track deletions, so the granular outputs +# are removed in a separate commit (workaround for datalad/datalad#7822, +# since fixed upstream). +# TODO research which datalad version shipped the datalad/datalad#7822 fix; +# once babs's minimum supported datalad is at or above it, fold this removal +# into the datalad run above and drop this step. +git rm -rf -q --sparse "${path}" +git commit -m "Remove ${path} for ${ZIP_ID} (zipped)" diff --git a/babs/templates/participant_job.sh.jinja2 b/babs/templates/participant_job.sh.jinja2 index f3151cea..776b7b02 100644 --- a/babs/templates/participant_job.sh.jinja2 +++ b/babs/templates/participant_job.sh.jinja2 @@ -130,10 +130,22 @@ for CONTAINER_JOB in "${CONTAINER_IMAGE_PATHS[@]}"; do exit 1 fi done +{% if hook_pre_run %} +# pre_run hooks: spliced before the datalad run wrapper; subshell exports the contract. +( + export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR +{% if processing_level == 'session' %} + export sesid +{% endif %} +{% for snippet in hook_pre_run %} +{{ snippet }} +{% endfor %} +) +{% endif %} # datalad run: datalad run \ - -i "{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }}" \ + -i "{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_run.sh' }}" \ {% for input_dataset in input_datasets %} {% if not input_dataset['is_zipped'] %} -i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \ @@ -149,13 +161,28 @@ datalad run \ --expand inputs \ {% endif %} --explicit \ +{% if output_dir %} + -o "{{ output_dir }}" \ +{% endif %} {% if zip_foldernames is not none %} {% for key, value in zip_foldernames.items() %} -o "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}_{% raw %}${sesid}{% endraw %}{% endif %}_{{ key }}-{{ value }}.zip" \ {% endfor %} {% endif %} -m "{{ (datalad_run_message if datalad_run_message is defined and datalad_run_message else container_name) }} {% raw %}${subid}{% endraw %}{% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}" \ - "bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}" + "bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_run.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}" +{% if hook_post_run %} +# post_run hooks: spliced after the run, before push; subshell exports the contract. +( + export subid BRANCH PROJECT_ROOT JOB_SCRATCH_DIR +{% if processing_level == 'session' %} + export sesid +{% endif %} +{% for snippet in hook_post_run %} +{{ snippet }} +{% endfor %} +) +{% endif %} # Finish up: # push result file content to output RIA storage: diff --git a/babs/utils.py b/babs/utils.py index b4fc01ab..13b929e3 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -238,6 +238,57 @@ def app_output_settings_from_config(config): return config['zip_foldernames'], bids_app_output_dir +def output_dir_from_config(config): + """ + Get the BIDS App output directory from the top-level `output_dir` key. + + Single-app mode only; pipeline mode still derives its folders from + `zip_foldernames` (via `app_output_settings_from_config`) and goes away + with pipeline mode. `output_dir` is the folder the BIDS App writes into, + relative to the dataset root, carrying the full versioned derivative name + (e.g. `outputs/fmriprep-24-1-1`). It is the single source for the app + write dir, the `datalad run` output declaration, and the default folder + the built-in `zip` hook archives. + + Parameters + ---------- + config: dictionary + attribute `config` in class Container; + + Returns + ------- + output_dir: str + the normalized `output_dir` value. + + Raises + ------ + ValueError + if the removed `zip_foldernames` / `all_results_in_one_zip` keys are + present, or `output_dir` is missing or not a relative in-dataset path. + """ + legacy = [key for key in ('zip_foldernames', 'all_results_in_one_zip') if key in config] + if legacy: + raise ValueError( + f'Config key(s) {legacy} are no longer supported. Declare the BIDS App' + " output folder with a top-level `output_dir` (e.g. 'outputs/fmriprep-24-1-1')" + ' and configure zipping as a post_run hook:' + ' `hooks: {post_run: [{builtin: zip}]}`.' + ) + output_dir = config.get('output_dir') + if not output_dir or not isinstance(output_dir, str): + raise ValueError( + 'The container config needs a top-level `output_dir`: the folder the' + ' BIDS App writes into, relative to the dataset root' + " (e.g. 'outputs/fmriprep-24-1-1')." + ) + normalized = os.path.normpath(output_dir) + if os.path.isabs(normalized) or normalized.startswith('..') or normalized == '.': + raise ValueError( + f'`output_dir` must be a relative path inside the dataset, got {output_dir!r}.' + ) + return normalized + + def get_username(): """ Get the current username. diff --git a/docs/preparation_config_yaml_file.rst b/docs/preparation_config_yaml_file.rst index 767a95ac..f8ba77cf 100644 --- a/docs/preparation_config_yaml_file.rst +++ b/docs/preparation_config_yaml_file.rst @@ -26,8 +26,7 @@ Sections in the configuration YAML file * **singularity_args**: the arguments for ``singularity run``; * **bids_app_args**: the arguments for the BIDS App; * **imported_files**: the files to be copied into the datalad dataset; -* **all_results_in_one_zip**: whether to zip all results in one zip file; -* **zip_foldernames**: the results foldername(s) to be zipped; +* **output_dir**: the folder the BIDS App writes its results into; * **required_files**: to only keep subjects (sessions) that have this list of required files in input dataset(s); * **alert_log_messages**: alert messages in the log files that may be helpful for debugging errors in failed jobs; @@ -162,6 +161,92 @@ It also means that the ``recon_spec.yaml`` file will be tracked by datalad. **Important**: If you are importing a large file this mechanism will not work. +.. _section-hooks: + +Section ``hooks`` +================= + +This section is optional. It lets you splice your own shell commands into each +participant job at two **splice points** that bracket the BIDS App run: + +- ``pre_run`` — runs just **before** the ``datalad run`` that executes the BIDS App. +- ``post_run`` — runs just **after** that ``datalad run`` completes (its outputs + already committed), but **before** the results are pushed to the output RIA. + +Each splice point is a list; entries run in the order you list them. + +Example section **hooks** +------------------------- + +.. code-block:: yaml + + hooks: + pre_run: + - echo "starting ${subid}" # a raw shell snippet + - script: "/path/to/validate-inputs.sh" # a script copied into the project + post_run: + - script: "/path/to/validate-outputs.sh" + - builtin: zip # a built-in shipped with BABS + +Three entry forms are supported: + +- **snippet** — a bare string. It is spliced **verbatim** into the job script + and runs *inline*. You own its quoting and safety (this is shell injection by + the config author, by design). +- **script** — ``{script: }``. ```` is an **absolute** local path + (copied into the project the same way as ``imported_files``). BABS copies it to + ``code/hooks/.sh`` at ``babs init`` and the splice runs + ``bash ./code/hooks/.sh`` — a **separate process**. +- **built-in** — ``{builtin: }``. The hook ships with BABS; ``babs init`` + copies it into ``code/hooks/.sh`` (git-tracked, so you can read exactly + what will run) and the splice runs it like a script hook. Keys beyond + ``builtin`` are parameters for that built-in, passed as **arguments** at the + splice site — several instances of the same built-in (e.g. several zip hooks) + share one script and differ only in their arguments. + +Built-in: ``zip`` +----------------- + +``{builtin: zip}`` archives one output folder as a ``post_run`` hook. Its two +optional parameters: + +- ``path`` — the folder to zip, relative to the dataset root. Defaults to the + top-level ``output_dir``, so the argless form covers the common case of + archiving the whole app output. +- ``name`` — the archive-name stem: the hook zips ``path`` into + ``${subid}[_${sesid}]_.zip`` at the dataset root. Defaults to + ``path``'s basename. Set it when the BIDS App controls the folder name and + you want a versioned archive name (e.g. ``path: outputs/freesurfer`` with + ``name: freesurfer-24-1-1``). + +The hook zips inside its **own** ``datalad run`` (so the archive is committed +with provenance), then removes the granular folder in a follow-up commit. The +archive contains the folder itself (e.g. ``fmriprep_minimal-25-2-5/``) at its +top level. To produce several separate archives, list several zip hooks, each +with its own ``path``. + +The runtime contract +-------------------- + +Both splice points run with the working directory set to the cloned dataset and +with these variables **exported** (so a separate-process ``script`` hook can read +them): ``subid``; ``sesid`` (session-level processing only); ``BRANCH``; +``PROJECT_ROOT``; ``JOB_SCRATCH_DIR``. Each splice runs in a **subshell** under +``set -e``: a hook's ``cd`` or variable changes do **not** leak into the rest of +the job, and a non-zero exit **aborts the job**. + +Persisting hook output +---------------------- + +Hooks splice **outside** the ``datalad run`` wrapper, so their filesystem effects +are **not** captured by it. A ``post_run`` hook that merely writes files into the +dataset leaves **uncommitted** changes — ``datalad push`` ships committed content, +so those files are **not pushed**. The common, safe use is **validation that fails +the job**: a ``post_run`` validator that exits non-zero aborts the job *before* +the push, so bad results never leave the node. If a hook needs to **persist** +output, it must run its **own** ``datalad run``/``datalad save`` to commit it. + + Section ``bids_app_args`` ========================= Currently, BABS does not support using configurations of running a BIDS App @@ -413,139 +498,86 @@ Advanced - Manual of writing section ``bids_app_args`` .. `notebooks/inDev_*.yaml` in `babs_tests` repo: done -Section ``zip_foldernames`` -=========================== - -This section defines the name(s) of the expected output folder(s). -BABS will zip those folder(s) into separate zip file(s). - -Here we provide two examples. :ref:`Example #1 ` -is for regular use cases, -where the BIDS App will generate one or several folders that wrap all derivative files. -Example use cases are ``fMRIPrep`` with legacy output layout, as well as ``QSIPrep`` and ``XCP-D``. +Section ``output_dir`` +====================== -If the BIDS App won't generate one or several folders that wrap all derivative files, -users should ask BABS to create a folder as an extra layer by specifying ``all_results_in_one_zip: true``. -We explain how to do so in :ref:`Example #2 `. -An example use case is ``fMRIPrep`` with BIDS output layout. +``output_dir`` is the folder the BIDS App writes its results into, relative to +the dataset root. It carries the full versioned derivative name (e.g. +``outputs/fmriprep-24-1-1``) and is the single source for that name: the same +string is the app's write directory, the ``datalad run`` output declaration, +and the default folder the built-in :ref:`zip hook ` archives. +Note that ``output_dir`` only *declares* the folder and commits whatever the +app writes there -- zipping is opt-in, configured as a ``post_run`` zip hook +(see Section ``hooks`` above). No zip hook = results are pushed as granular +files. -.. _example_zip_foldernames_for_fmriprep_legacy_output_layout: +Which folder to point it at depends on the BIDS App's output layout: -Example #1: for ``fMRIPrep`` *legacy* output layout ---------------------------------------------------- +Example #1: the BIDS App writes directly into the output directory +------------------------------------------------------------------- -Here we use ``fMRIPrep`` (*legacy* output layout) as an example to show you -how to write this ``zip_foldernames`` section. For this case, all derivative files -are wrapped in folders generated by fMRIPrep. Similar use cases are ``QSIPrep`` -(e.g., generating a folder called ``qsiprep``), and ``XCP-D`` (generating a folder called ``xcp_d``). - -Older versions of ``fMRIPrep`` (version < 21.0) generate -`legacy output layout `_ -which looks like below:: - - / - fmriprep/ - freesurfer/ - -In this case, ``fMRIPrep`` generates two folders, ``fmriprep`` and ``freesurfer``, -which include all derivatives. Therefore, we can directly tell BABS the expected foldernames, -without asking BABS to create them. - -Example section **zip_foldernames** for ``fMRIPrep`` *legacy* output layout: +Recent ``fMRIPrep`` (version >= 21.0) uses +`BIDS output layout `_: +it writes files like ``sub-