diff --git a/generate_entity_patterns.sh b/generate_entity_patterns.sh index a3a1955..dc3fcc2 100755 --- a/generate_entity_patterns.sh +++ b/generate_entity_patterns.sh @@ -53,5 +53,5 @@ printf '@(%s)\n' "$(jq -r '.objects.extensions[].value' schema.json \ | paste -sd'|')" echo -echo "### datatype regex (for _libBIDSsh_parse_filename) ###" -printf '(%s)\n' "$(jq -r '.objects.datatypes | keys[]' schema.json | paste -sd'|')" +echo "### datatype regex (anchored to a whole path component; for _libBIDSsh_parse_filename) ###" +printf '(^|/)(%s)$\n' "$(jq -r '.objects.datatypes | keys[]' schema.json | paste -sd'|')" diff --git a/libBIDS.sh b/libBIDS.sh index 60e5bc8..1e0bcf6 100755 --- a/libBIDS.sh +++ b/libBIDS.sh @@ -245,11 +245,28 @@ _libBIDSsh_parse_filename() { # Store the full path and filename arr[path]=$(tr -s / <<<"${path}") + arr[extension]="${filename#*.}" - # Extract from schema + + # Datatype is the BIDS datatype directory directly containing the file (the + # final component of the file's directory). Anchor to a whole path component + # so an unrelated substring (e.g. "func" in "study_func/") cannot match. # ./generate_entity_patterns.sh -> datatype regex - arr[datatype]=$(grep -o -E "(anat|beh|dwi|eeg|emg|fmap|func|ieeg|meg|micr|motion|mrs|nirs|perf|pet|phenotype)" <<<"$(dirname "${path}")" | head -1 || echo "NA") - arr[derivatives]=$(grep -o 'derivatives/.*/' <<<"${path}" | awk -F/ '{print $2}' || echo "NA") + local dir + dir=$(dirname "${path}") + if [[ ${dir} =~ (^|/)(anat|beh|dwi|eeg|emg|fmap|func|ieeg|meg|micr|motion|mrs|nirs|perf|pet|phenotype)$ ]]; then + arr[datatype]="${BASH_REMATCH[2]}" + else + arr[datatype]="NA" + fi + + # Derivatives pipeline name is the path component immediately following a + # "derivatives/" directory; NA when the file is not under one. + if [[ ${path} =~ (^|/)derivatives/([^/]+)/ ]]; then + arr[derivatives]="${BASH_REMATCH[2]}" + else + arr[derivatives]="NA" + fi local name_no_ext="${filename%%.*}" diff --git a/test_libBIDS.sh b/test_libBIDS.sh index 9b90dba..c6df3a5 100755 --- a/test_libBIDS.sh +++ b/test_libBIDS.sh @@ -70,6 +70,36 @@ test_parse_filename() { return 0 } +test_parse_filename_datatype_derivatives() { + # Datatype must come from the file's parent directory anchored to a whole + # path component, not a substring anywhere in the path. + declare -A a + _libBIDSsh_parse_filename "/data/study_func_proj/sub-01/anat/sub-01_T1w.nii.gz" a + assert_equals "anat" "${a[datatype]:-}" "datatype must be anat despite 'func' in path" || return 1 + + declare -A b + _libBIDSsh_parse_filename "/data/sub-01/emg/sub-01_task-rest_emg.edf" b + assert_equals "emg" "${b[datatype]:-}" "emg datatype should be detected" || return 1 + + # No datatype directory -> NA + declare -A c + _libBIDSsh_parse_filename "/data/sub-01/sub-01_scans.tsv" c + assert_equals "NA" "${c[datatype]:-}" "missing datatype dir should be NA" || return 1 + + # Derivatives pipeline is the component right after 'derivatives/'. + declare -A d + _libBIDSsh_parse_filename "/data/derivatives/fmriprep/sub-01/anat/sub-01_desc-preproc_T1w.nii.gz" d + assert_equals "fmriprep" "${d[derivatives]:-}" "derivatives pipeline should be fmriprep" || return 1 + assert_equals "anat" "${d[datatype]:-}" "derivative datatype should be anat" || return 1 + + # A directory merely containing the word 'derivatives' must not match. + declare -A e + _libBIDSsh_parse_filename "/data/myderivatives/sub-01/anat/sub-01_T1w.nii.gz" e + assert_equals "NA" "${e[derivatives]:-}" "'myderivatives' must not be treated as derivatives" || return 1 + + return 0 +} + test_parse_bids_to_table() { local bids_dir="bids-examples/ds001" local table @@ -210,6 +240,7 @@ echo "Starting libBIDS.sh test suite..." echo "---" run_test "Internal: _libBIDSsh_parse_filename" test_parse_filename +run_test "Internal: _libBIDSsh_parse_filename datatype/derivatives" test_parse_filename_datatype_derivatives run_test "Public API: libBIDSsh_parse_bids_to_table" test_parse_bids_to_table run_test "Public API: libBIDSsh_table_filter" test_table_filter run_test "Public API: libBIDSsh_drop_na_columns" test_drop_na_columns