From 433229cf8c297b4a6aba207fd2d66175607f9ce8 Mon Sep 17 00:00:00 2001 From: "Gabriel A. Devenyi" Date: Thu, 11 Jun 2026 12:47:57 -0400 Subject: [PATCH] Harden datatype/derivatives extraction in filename parser _libBIDSsh_parse_filename derived the datatype and derivatives pipeline with substring greps piped through head/awk. Two latent defects: - Substring false-match: `grep -oE "(anat|...|func|...)"` matched the datatype anywhere in the path, so e.g. `study_func_proj/sub-01/anat/...` was mis-detected as `func` instead of `anat`. The same flaw let a directory like `myderivatives/` be read as a derivatives dataset. - SIGPIPE fragility: `grep ... | head -1 || echo "NA"` can have grep killed by SIGPIPE when head closes the pipe; under `set -o pipefail` the pipeline then reports failure and silently falls back to `NA`. Replace both with anchored bash regex on the path (no subshells, no pipes): datatype is the file's immediate parent directory matched as a whole component `(^|/)$`; the derivatives pipeline is the component right after `(^|/)derivatives/`. generate_entity_patterns.sh now emits the anchored datatype regex to keep the generated block in sync. Add regression tests covering the substring false-match, emg detection, missing-datatype NA, derivatives extraction, and the `myderivatives` non-match. Suite: 9/9 pass. Co-Authored-By: Claude Opus 4.8 --- generate_entity_patterns.sh | 4 ++-- libBIDS.sh | 23 ++++++++++++++++++++--- test_libBIDS.sh | 31 +++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/generate_entity_patterns.sh b/generate_entity_patterns.sh index a3a1955..dc3fcc2 100755 --- a/generate_entity_patterns.sh +++ b/generate_entity_patterns.sh @@ -53,5 +53,5 @@ printf '@(%s)\n' "$(jq -r '.objects.extensions[].value' schema.json \ | paste -sd'|')" echo -echo "### datatype regex (for _libBIDSsh_parse_filename) ###" -printf '(%s)\n' "$(jq -r '.objects.datatypes | keys[]' schema.json | paste -sd'|')" +echo "### datatype regex (anchored to a whole path component; for _libBIDSsh_parse_filename) ###" +printf '(^|/)(%s)$\n' "$(jq -r '.objects.datatypes | keys[]' schema.json | paste -sd'|')" diff --git a/libBIDS.sh b/libBIDS.sh index 60e5bc8..1e0bcf6 100755 --- a/libBIDS.sh +++ b/libBIDS.sh @@ -245,11 +245,28 @@ _libBIDSsh_parse_filename() { # Store the full path and filename arr[path]=$(tr -s / <<<"${path}") + arr[extension]="${filename#*.}" - # Extract from schema + + # Datatype is the BIDS datatype directory directly containing the file (the + # final component of the file's directory). Anchor to a whole path component + # so an unrelated substring (e.g. "func" in "study_func/") cannot match. # ./generate_entity_patterns.sh -> datatype regex - arr[datatype]=$(grep -o -E "(anat|beh|dwi|eeg|emg|fmap|func|ieeg|meg|micr|motion|mrs|nirs|perf|pet|phenotype)" <<<"$(dirname "${path}")" | head -1 || echo "NA") - arr[derivatives]=$(grep -o 'derivatives/.*/' <<<"${path}" | awk -F/ '{print $2}' || echo "NA") + local dir + dir=$(dirname "${path}") + if [[ ${dir} =~ (^|/)(anat|beh|dwi|eeg|emg|fmap|func|ieeg|meg|micr|motion|mrs|nirs|perf|pet|phenotype)$ ]]; then + arr[datatype]="${BASH_REMATCH[2]}" + else + arr[datatype]="NA" + fi + + # Derivatives pipeline name is the path component immediately following a + # "derivatives/" directory; NA when the file is not under one. + if [[ ${path} =~ (^|/)derivatives/([^/]+)/ ]]; then + arr[derivatives]="${BASH_REMATCH[2]}" + else + arr[derivatives]="NA" + fi local name_no_ext="${filename%%.*}" diff --git a/test_libBIDS.sh b/test_libBIDS.sh index 9b90dba..c6df3a5 100755 --- a/test_libBIDS.sh +++ b/test_libBIDS.sh @@ -70,6 +70,36 @@ test_parse_filename() { return 0 } +test_parse_filename_datatype_derivatives() { + # Datatype must come from the file's parent directory anchored to a whole + # path component, not a substring anywhere in the path. + declare -A a + _libBIDSsh_parse_filename "/data/study_func_proj/sub-01/anat/sub-01_T1w.nii.gz" a + assert_equals "anat" "${a[datatype]:-}" "datatype must be anat despite 'func' in path" || return 1 + + declare -A b + _libBIDSsh_parse_filename "/data/sub-01/emg/sub-01_task-rest_emg.edf" b + assert_equals "emg" "${b[datatype]:-}" "emg datatype should be detected" || return 1 + + # No datatype directory -> NA + declare -A c + _libBIDSsh_parse_filename "/data/sub-01/sub-01_scans.tsv" c + assert_equals "NA" "${c[datatype]:-}" "missing datatype dir should be NA" || return 1 + + # Derivatives pipeline is the component right after 'derivatives/'. + declare -A d + _libBIDSsh_parse_filename "/data/derivatives/fmriprep/sub-01/anat/sub-01_desc-preproc_T1w.nii.gz" d + assert_equals "fmriprep" "${d[derivatives]:-}" "derivatives pipeline should be fmriprep" || return 1 + assert_equals "anat" "${d[datatype]:-}" "derivative datatype should be anat" || return 1 + + # A directory merely containing the word 'derivatives' must not match. + declare -A e + _libBIDSsh_parse_filename "/data/myderivatives/sub-01/anat/sub-01_T1w.nii.gz" e + assert_equals "NA" "${e[derivatives]:-}" "'myderivatives' must not be treated as derivatives" || return 1 + + return 0 +} + test_parse_bids_to_table() { local bids_dir="bids-examples/ds001" local table @@ -210,6 +240,7 @@ echo "Starting libBIDS.sh test suite..." echo "---" run_test "Internal: _libBIDSsh_parse_filename" test_parse_filename +run_test "Internal: _libBIDSsh_parse_filename datatype/derivatives" test_parse_filename_datatype_derivatives run_test "Public API: libBIDSsh_parse_bids_to_table" test_parse_bids_to_table run_test "Public API: libBIDSsh_table_filter" test_table_filter run_test "Public API: libBIDSsh_drop_na_columns" test_drop_na_columns