diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f3e3576..59d6b4fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add SDN parameter/UOM URNs and names (TEMPSZ01, PSALSZ01, INFLTF01, UPAA, PSUX, ULPM, etc.) for TSG temperature, salinity, and flow rate vocabulary entries. - Add `pre-commit` configuration running `ruff` format and lint (mirrors `make lint`) on every commit, with a `make install-hooks` target to install it. - Add `ocean_data_parser_version` global attribute to all parsed datasets by routing the NMEA, Star-Oddi DAT, and Sunburst superCO2 notes parsers through `standardize_dataset`. +- dfo.odf: add a `drop_path_from_attributes` option to strip the directory path from the ODF_HEADER FILE_SPECIFICATION and +  INSTRUMENT_HEADER DESCRIPTION attributes, keeping only the file name. +  Defaults to True for `bio_odf` and False for `mli_odf` and `odf`. ### Fixed diff --git a/ocean_data_parser/parsers/dfo/odf.py b/ocean_data_parser/parsers/dfo/odf.py index 6c7effe1..a64a7be2 100644 --- a/ocean_data_parser/parsers/dfo/odf.py +++ b/ocean_data_parser/parsers/dfo/odf.py @@ -72,7 +72,10 @@ def bio_odf( - path: str, global_attributes: dict = None, encoding="Windows-1252" + path: str, + global_attributes: dict = None, + encoding="Windows-1252", + drop_path_from_attributes=True, ) -> xarray.Dataset: """Bedford Institute of Ocean ODF format parser. 
@@ -80,6 +83,8 @@ def bio_odf( path (str): Path to the odf file to parse global_attributes (dict): file specific global attributes encoding (str): Encoding format of the file (default: Windows-1252) + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: dataset (xarray dataset): Parsed xarray dataset @@ -89,17 +94,23 @@ def bio_odf( vocabularies=["BIO", "GF3"], global_attributes={**bio_global_attributes, **(global_attributes or {})}, encoding=encoding, + drop_path_from_attributes=drop_path_from_attributes, ) def mli_odf( - path: str, global_attributes: dict = None, encoding="Windows-1252" + path: str, + global_attributes: dict = None, + encoding="Windows-1252", + drop_path_from_attributes=False, ) -> xarray.Dataset: """Maurice Lamontagne Institute ODF format parser. Args: path (str): Path to the odf file to parse global_attributes (dict): file specific global attributes + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION encoding (str): Encoding format of the file (default: Windows-1252) Returns: @@ -109,6 +120,7 @@ def mli_odf( path, vocabularies=["MLI", "GF3"], global_attributes={**mli_global_attributes, **(global_attributes or {})}, + drop_path_from_attributes=drop_path_from_attributes, encoding=encoding, ) @@ -141,6 +153,7 @@ def odf( global_attributes: dict = None, encoding: str = "Windows-1252", filename_convention=FILE_NAME_CONVENTIONS, + drop_path_from_attributes: bool = False, ) -> xarray.Dataset: """ODF format parser. 
def drop_path_from_header_attributes(header: dict) -> dict:
    """Reduce path-like header attributes to their trailing file name.

    Only ``ODF_HEADER.FILE_SPECIFICATION`` and
    ``INSTRUMENT_HEADER.DESCRIPTION`` are considered. Everything up to and
    including the last forward slash or backslash is removed, so POSIX and
    Windows style paths are both handled regardless of the platform the
    parser runs on.

    The header is modified in place and is also returned for convenience.

    Note:
        Any slash in the value is treated as a path separator, including one
        that appears in free text rather than in a path.

    Args:
        header (dict): Parsed ODF header, keyed by header section name.

    Returns:
        dict: The same ``header`` object with the paths stripped.
    """
    attributes = (
        ("ODF_HEADER", "FILE_SPECIFICATION"),
        ("INSTRUMENT_HEADER", "DESCRIPTION"),
    )

    for header_key, attribute_key in attributes:
        section = header.get(header_key)
        # A missing section, or one that is not a mapping, has nothing to strip.
        if not isinstance(section, dict):
            continue

        value = section.get(attribute_key)
        # Only non-empty strings can hold a path; leave any other value
        # untouched instead of letting re.split raise a TypeError.
        if not isinstance(value, str) or not value:
            continue

        section[attribute_key] = re.split(r"[\\/]", value)[-1]

    return header
@@ -78,6 +108,9 @@ def parse_odf( metadata["EVENT_HEADER"]["DATA_TYPE"], ) + if drop_path_from_attributes: + metadata = drop_path_from_header_attributes(metadata) + # Write global and variable attributes file_name_attributes = ( re.search(filename_convention, Path(odf_path).name) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 898586a5..0c35397f 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -25,6 +25,9 @@ ) from ocean_data_parser.parsers.dfo.odf_source.attributes import _review_station from ocean_data_parser.parsers.dfo.odf_source.parser import _convert_odf_time +from ocean_data_parser.parsers.dfo.odf_source.process import ( + drop_path_from_header_attributes, +) def search_caplog_records(caplog, message, levelname=None): @@ -460,6 +463,50 @@ def test_odf_station_in_globals(self, global_attributes, original_header, statio ) assert response == station, f"Failed to retrieve station={station}" + @pytest.mark.parametrize( + ("path", "expect"), + [ + ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), + ( + "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", + "CTD_001.odf", + ), + ( + r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", + "CTD_001.odf", + ), + ], + ) + def test_odf_header_file_description_with_no_path(self, path, expect): + result = drop_path_from_header_attributes( + {"ODF_HEADER": {"FILE_SPECIFICATION": path}} + ) + assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, ( + "Failed to drop path from header attributes" + ) + + @pytest.mark.parametrize( + ("path", "expect"), + [ + ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), + ( + "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", + "CTD_001.odf", + ), + ( + r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", + "CTD_001.odf", + ), + ], + ) + def test_instrument_header_description_no_path(self, path, expect): + result = 
drop_path_from_header_attributes( + {"INSTRUMENT_HEADER": {"DESCRIPTION": path}} + ) + assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, ( + "Failed to drop path from header attributes" + ) + class TestODFBIOParser: @pytest.mark.parametrize( diff --git a/tests/test_reference_netcdf.py b/tests/test_reference_netcdf.py index 25ecb5a4..ca9ccaea 100644 --- a/tests/test_reference_netcdf.py +++ b/tests/test_reference_netcdf.py @@ -131,6 +131,29 @@ def ignore_from_attr(attr, expression, placeholder): reference.attrs.pop("ocean_data_parser_version", None) test.attrs.pop("ocean_data_parser_version", None) + reference.attrs.pop("Conventions", None) + test.attrs.pop("Conventions", None) + + # Normalize paths stripped by drop_path_from_header_attributes so older + # reference files (generated before path-stripping) still match. + def _basename(value): + if not isinstance(value, str): + return value + return re.split(r"\\|/", value)[-1] + + for ds in (reference, test): + if "instrument_description" in ds.attrs: + ds.attrs["instrument_description"] = _basename( + ds.attrs["instrument_description"] + ) + for attr in ("original_odf_header_json", "original_header"): + if attr in ds.attrs and isinstance(ds.attrs[attr], str): + ds.attrs[attr] = re.sub( + r'("(?:DESCRIPTION|FILE_SPECIFICATION)"\s*[:=]\s*")([^"]*)"', + lambda m: f'{m.group(1)}{_basename(m.group(2))}"', + ds.attrs[attr], + ) + reference = _standardize_dataset(reference) test = _standardize_dataset(test)