From 71bd9c153203eb044244a0044fad790cd02714cd Mon Sep 17 00:00:00 2001 From: Jessy Barrette Date: Wed, 26 Jun 2024 16:25:23 -0400 Subject: [PATCH 1/7] ignore paths from some odf headers. --- ocean_data_parser/parsers/dfo/odf.py | 27 ++++++++++++--- .../parsers/dfo/odf_source/process.py | 34 +++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/ocean_data_parser/parsers/dfo/odf.py b/ocean_data_parser/parsers/dfo/odf.py index 731165165..82fd4ebae 100644 --- a/ocean_data_parser/parsers/dfo/odf.py +++ b/ocean_data_parser/parsers/dfo/odf.py @@ -57,28 +57,38 @@ } -def bio_odf(path: str, global_attributes: dict = None) -> xarray.Dataset: +def bio_odf( + path: str, global_attributes: dict = None, drop_path_from_attributes=True +) -> xarray.Dataset: """Bedford Institute of Ocean ODF format parser Args: path (str): Path to the odf file to parse global_attributes (dict): file specific global attributes + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: dataset (xarray dataset): Parsed xarray dataset """ - return _odf( + ds = _odf( path, vocabularies=["BIO", "GF3"], global_attributes={**bio_global_attributes, **(global_attributes or {})}, + drop_path_from_attributes=drop_path_from_attributes, ) + return ds -def mli_odf(path: str, global_attributes: dict = None) -> xarray.Dataset: +def mli_odf( + path: str, global_attributes: dict = None, drop_path_from_attributes=False +) -> xarray.Dataset: """Maurice Lamontagne Institute ODF format parser Args: path (str): Path to the odf file to parse global_attributes (dict): file specific global attributes + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: dataset (xarray dataset): Parsed xarray dataset """ @@ -86,16 +96,24 @@ def mli_odf(path: str, global_attributes: dict = None) -> xarray.Dataset: path, vocabularies=["MLI", "GF3"], global_attributes={**mli_global_attributes, **(global_attributes or {})}, + drop_path_from_attributes=drop_path_from_attributes, ) -def _odf(path: str, vocabularies: list = None, global_attributes: dict = None): +def _odf( + path: str, + vocabularies: list = None, + global_attributes: dict = None, + drop_path_from_attributes=False, +): """ODF format parser Args: path (str): Path to the odf file to parse vocabularies (str): Vocabulary list to use for the vocabulary mapping global_attributes (dict): file specific global attributes + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: dataset (xarray dataset): Parsed xarray dataset """ @@ -103,4 +121,5 @@ def _odf(path: str, vocabularies: list = None, global_attributes: dict = None): path, vocabularies=vocabularies, global_attributes={**odf_global_attributes, **(global_attributes or {})}, + drop_path_from_attributes=drop_path_from_attributes, ) diff --git a/ocean_data_parser/parsers/dfo/odf_source/process.py b/ocean_data_parser/parsers/dfo/odf_source/process.py index e4c91b33f..4ddc46478 100644 --- a/ocean_data_parser/parsers/dfo/odf_source/process.py +++ b/ocean_data_parser/parsers/dfo/odf_source/process.py @@ -40,12 +40,41 @@ ] +def drop_path_from_header_attributes(header: dict) -> dict: + """Drop paths from the some of the header parsed attributes. + + Args: + header (dict): Header attributes + + Returns: + dict: Header attributes without the path + """ + try: + header["ODF_HEADER"]["FILE_SPECIFICATION"] = Path( + header["ODF_HEADER"]["FILE_SPECIFICATION"] + ).name + if "INSTRUMENT_HEADER" in header and header["INSTRUMENT_HEADER"].get( + "DESCRIPTION" + ): + header["INSTRUMENT_HEADER"]["DESCRIPTION"] = " ".join( + [ + re.split(r"\\|\/", item)[-1] + for item in header["INSTRUMENT_HEADER"]["DESCRIPTION"].split(" ") + ] + ) + except Exception: + logger.error("Error while dropping path from header attributes", exc_info=True) + + return header + + def parse_odf( odf_path: str, global_attributes: dict = None, vocabularies: list = None, add_attributes_existing_variables: bool = True, generate_new_vocabulary_variables: bool = True, + drop_path_from_attributes: bool = False, ) -> xr.Dataset: """Convert an ODF file to an xarray object. @@ -59,6 +88,8 @@ def parse_odf( Defaults to True. generate_new_vocabulary_variables (bool, optional): Generate vocabulary variables. Defaults to True. + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: xr.Dataset: Parsed dataset @@ -73,6 +104,9 @@ def parse_odf( metadata["EVENT_HEADER"]["DATA_TYPE"], ) + if drop_path_from_attributes: + metadata = drop_path_from_header_attributes(metadata) + # Write global and variable attributes file_name_attributes = re.search(FILE_NAME_CONVENTIONS, Path(odf_path).name) if not file_name_attributes: From 74c07bc4387f38d502edcc6cc9614e87d7e52262 Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:27:45 -0400 Subject: [PATCH 2/7] add to changelog.txt --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbebcaaa..090ddcf3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add odpy convert `input_table` input through config file, which gives the ability to list multiple file glob expression and associated metadata. +- dfo.odf: add option to ignore full paths from ODF_HEADER FILE_DESCRIPTION and +INSTRUMENT_HEADER DESCRIPTION attributes in odf. +Default to True for BIO and False for IML. ## `0.5.2` - 2024-06-22 From 6084a837e5f2e6fcd8954b93f83e43c8353ea66b Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:49:58 -0400 Subject: [PATCH 3/7] fix logic and add tests --- .../parsers/dfo/odf_source/process.py | 26 ++++++++--------- tests/test_parsers.py | 28 ++++++++++++++++++- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/ocean_data_parser/parsers/dfo/odf_source/process.py b/ocean_data_parser/parsers/dfo/odf_source/process.py index 4ddc46478..c551e1f73 100644 --- a/ocean_data_parser/parsers/dfo/odf_source/process.py +++ b/ocean_data_parser/parsers/dfo/odf_source/process.py @@ -49,21 +49,19 @@ def drop_path_from_header_attributes(header: dict) -> dict: Returns: dict: Header attributes without the path """ - try: - header["ODF_HEADER"]["FILE_SPECIFICATION"] = Path( - header["ODF_HEADER"]["FILE_SPECIFICATION"] - ).name - if "INSTRUMENT_HEADER" in header and header["INSTRUMENT_HEADER"].get( - "DESCRIPTION" - ): - header["INSTRUMENT_HEADER"]["DESCRIPTION"] = " ".join( - [ - re.split(r"\\|\/", item)[-1] - for item in header["INSTRUMENT_HEADER"]["DESCRIPTION"].split(" ") - ] + def _get_file(file_path: str) -> str: + return re.split(r"\\|\/",file_path)[-1] + + attributes = [ + ("ODF_HEADER", "FILE_SPECIFICATION"), + ("INSTRUMENT_HEADER", "DESCRIPTION"), + ] + + for header_key, attribute_key in attributes: + if header_key in header and header[header_key].get(attribute_key): + header[header_key][attribute_key] = _get_file( + header[header_key][attribute_key] ) - except Exception: - logger.error("Error while dropping path from header attributes", exc_info=True) return header diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 92d9a7ce3..0fded8b39 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -23,6 +23,7 @@ ) from ocean_data_parser.parsers.dfo.odf_source.attributes import _review_station from ocean_data_parser.parsers.dfo.odf_source.parser import _convert_odf_time +from ocean_data_parser.parsers.dfo.odf_source.process import drop_path_from_header_attributes def search_caplog_records(caplog, message, levelname=None): @@ -346,6 +347,31 @@ def test_odf_station_in_globals(self, global_attributes, original_header, statio assert response == station, f"Failed to retrieve station={station}" + @pytest.mark.parametrize( + "path,expect", + [ + ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), + ("tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", "CTD_001.odf"), + (r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", "CTD_001.odf"), + ] + ) + def test_odf_header_file_description_with_no_path(self, path,expect): + result = drop_path_from_header_attributes({"ODF_HEADER":{"FILE_SPECIFICATION": path}}) + assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, f"Failed to drop path from header attributes" + + + @pytest.mark.parametrize( + "path, expect", + [ + ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), + ("tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", "CTD_001.odf"), + (r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", "CTD_001.odf"), + ] + ) + def test_instrument_header_description_no_path(self,path,expect): + result = drop_path_from_header_attributes({"INSTRUMENT_HEADER":{"DESCRIPTION": path}}) + assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, f"Failed to drop path from header attributes" + class TestODFBIOParser: @pytest.mark.parametrize( "path", glob("tests/parsers_test_files/dfo/odf/bio/**/CTD*.ODF", recursive=True) @@ -354,7 +380,7 @@ def test_bio_odf_ctd_parser(self, path, caplog): """Test DFO BIO ODF Parser""" ds = dfo.odf.bio_odf(path) review_parsed_dataset(ds, path, caplog) - + class TestODFMLIParser: @pytest.mark.parametrize( From ef84d0c095dbaae50568743c7332629b06b8640b Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:19:52 -0400 Subject: [PATCH 4/7] fix ruff issues --- tests/test_parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 0fded8b39..baaeaa865 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -357,7 +357,7 @@ def test_odf_station_in_globals(self, global_attributes, original_header, statio ) def test_odf_header_file_description_with_no_path(self, path,expect): result = drop_path_from_header_attributes({"ODF_HEADER":{"FILE_SPECIFICATION": path}}) - assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, f"Failed to drop path from header attributes" + assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, "Failed to drop path from header attributes" @pytest.mark.parametrize( @@ -370,7 +370,7 @@ def test_odf_header_file_description_with_no_path(self, path,expect): ) def test_instrument_header_description_no_path(self,path,expect): result = drop_path_from_header_attributes({"INSTRUMENT_HEADER":{"DESCRIPTION": path}}) - assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, f"Failed to drop path from header attributes" + assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, "Failed to drop path from header attributes" class TestODFBIOParser: @pytest.mark.parametrize( From bdc9fb515507e7556e64783e792a6dd0b1c12acc Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Tue, 26 May 2026 16:00:47 -0400 Subject: [PATCH 5/7] refactor: update parser functions to improve readability and maintainability --- ocean_data_parser/parsers/dfo/odf.py | 18 ++++-- .../parsers/dfo/odf_source/process.py | 5 +- tests/test_parsers.py | 57 +++++++++++++------ 3 files changed, 55 insertions(+), 25 deletions(-) diff --git a/ocean_data_parser/parsers/dfo/odf.py b/ocean_data_parser/parsers/dfo/odf.py index 8537eecad..b5d445610 100644 --- a/ocean_data_parser/parsers/dfo/odf.py +++ b/ocean_data_parser/parsers/dfo/odf.py @@ -72,7 +72,10 @@ def bio_odf( - path: str, global_attributes: dict = None, encoding="Windows-1252", drop_path_from_attributes=True + path: str, + global_attributes: dict = None, + encoding="Windows-1252", + drop_path_from_attributes=True, ) -> xarray.Dataset: """Bedford Institute of Ocean ODF format parser. @@ -80,6 +83,8 @@ def bio_odf( path (str): Path to the odf file to parse global_attributes (dict): file specific global attributes encoding (str): Encoding format of the file (default: Windows-1252) + drop_path_from_attributes (bool): Drop the path from the attributes + ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION Returns: dataset (xarray dataset): Parsed xarray dataset @@ -89,13 +94,15 @@ def bio_odf( vocabularies=["BIO", "GF3"], global_attributes={**bio_global_attributes, **(global_attributes or {})}, encoding=encoding, - drop_path_from_attributes=drop_path_from_attributes + drop_path_from_attributes=drop_path_from_attributes, ) - return ds def mli_odf( - path: str, global_attributes: dict = None, encoding="Windows-1252", drop_path_from_attributes=False + path: str, + global_attributes: dict = None, + encoding="Windows-1252", + drop_path_from_attributes=False, ) -> xarray.Dataset: """Maurice Lamontagne Institute ODF format parser. @@ -146,6 +153,7 @@ def odf( global_attributes: dict = None, encoding: str = "Windows-1252", filename_convention=FILE_NAME_CONVENTIONS, + drop_path_from_attributes: bool = False, ) -> xarray.Dataset: """ODF format parser. @@ -155,7 +163,7 @@ def odf( global_attributes (dict): file specific global attributes drop_path_from_attributes (bool): Drop the path from the attributes ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION - + encoding (str): Encoding format of the file (default: Windows-1252) filename_convention (str): File name convention to extract attributes. Should be a regex expression. diff --git a/ocean_data_parser/parsers/dfo/odf_source/process.py b/ocean_data_parser/parsers/dfo/odf_source/process.py index 9ddeadbfc..e91fbcca0 100644 --- a/ocean_data_parser/parsers/dfo/odf_source/process.py +++ b/ocean_data_parser/parsers/dfo/odf_source/process.py @@ -49,9 +49,10 @@ def drop_path_from_header_attributes(header: dict) -> dict: Returns: dict: Header attributes without the path """ + def _get_file(file_path: str) -> str: - return re.split(r"\\|\/",file_path)[-1] - + return re.split(r"\\|\/", file_path)[-1] + attributes = [ ("ODF_HEADER", "FILE_SPECIFICATION"), ("INSTRUMENT_HEADER", "DESCRIPTION"), diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 199d5db71..0c35397ff 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -25,7 +25,9 @@ ) from ocean_data_parser.parsers.dfo.odf_source.attributes import _review_station from ocean_data_parser.parsers.dfo.odf_source.parser import _convert_odf_time -from ocean_data_parser.parsers.dfo.odf_source.process import drop_path_from_header_attributes +from ocean_data_parser.parsers.dfo.odf_source.process import ( + drop_path_from_header_attributes, +) def search_caplog_records(caplog, message, levelname=None): @@ -461,31 +463,50 @@ def test_odf_station_in_globals(self, global_attributes, original_header, statio ) assert response == station, f"Failed to retrieve station={station}" - @pytest.mark.parametrize( - "path,expect", + ("path", "expect"), [ ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), - ("tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", "CTD_001.odf"), - (r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", "CTD_001.odf"), - ] + ( + "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", + "CTD_001.odf", + ), + ( + r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", + "CTD_001.odf", + ), + ], ) - def test_odf_header_file_description_with_no_path(self, path,expect): - result = drop_path_from_header_attributes({"ODF_HEADER":{"FILE_SPECIFICATION": path}}) - assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, "Failed to drop path from header attributes" + def test_odf_header_file_description_with_no_path(self, path, expect): + result = drop_path_from_header_attributes( + {"ODF_HEADER": {"FILE_SPECIFICATION": path}} + ) + assert result["ODF_HEADER"]["FILE_SPECIFICATION"] == expect, ( + "Failed to drop path from header attributes" + ) - @pytest.mark.parametrize( - "path, expect", + ("path", "expect"), [ ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"), - ("tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", "CTD_001.odf"), - (r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", "CTD_001.odf"), - ] + ( + "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf", + "CTD_001.odf", + ), + ( + r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf", + "CTD_001.odf", + ), + ], ) - def test_instrument_header_description_no_path(self,path,expect): - result = drop_path_from_header_attributes({"INSTRUMENT_HEADER":{"DESCRIPTION": path}}) - assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, "Failed to drop path from header attributes" + def test_instrument_header_description_no_path(self, path, expect): + result = drop_path_from_header_attributes( + {"INSTRUMENT_HEADER": {"DESCRIPTION": path}} + ) + assert result["INSTRUMENT_HEADER"]["DESCRIPTION"] == expect, ( + "Failed to drop path from header attributes" + ) + class TestODFBIOParser: @pytest.mark.parametrize( @@ -495,7 +516,7 @@ def test_bio_odf_ctd_parser(self, path, caplog): """Test DFO BIO ODF Parser.""" ds = dfo.odf.bio_odf(path) review_parsed_dataset(ds, path, caplog) - + class TestODFMLIParser: @pytest.mark.parametrize( From 035ab4087c756f0a0f1be5e1390c3b2befccc40b Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Tue, 26 May 2026 16:15:28 -0400 Subject: [PATCH 6/7] feat: remove Conventions attribute and normalize paths in instrument_description and headers --- tests/test_reference_netcdf.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_reference_netcdf.py b/tests/test_reference_netcdf.py index 25ecb5a47..ca9ccaea3 100644 --- a/tests/test_reference_netcdf.py +++ b/tests/test_reference_netcdf.py @@ -131,6 +131,29 @@ def ignore_from_attr(attr, expression, placeholder): reference.attrs.pop("ocean_data_parser_version", None) test.attrs.pop("ocean_data_parser_version", None) + reference.attrs.pop("Conventions", None) + test.attrs.pop("Conventions", None) + + # Normalize paths stripped by drop_path_from_header_attributes so older + # reference files (generated before path-stripping) still match. + def _basename(value): + if not isinstance(value, str): + return value + return re.split(r"\\|/", value)[-1] + + for ds in (reference, test): + if "instrument_description" in ds.attrs: + ds.attrs["instrument_description"] = _basename( + ds.attrs["instrument_description"] + ) + for attr in ("original_odf_header_json", "original_header"): + if attr in ds.attrs and isinstance(ds.attrs[attr], str): + ds.attrs[attr] = re.sub( + r'("(?:DESCRIPTION|FILE_SPECIFICATION)"\s*[:=]\s*")([^"]*)"', + lambda m: f'{m.group(1)}{_basename(m.group(2))}"', + ds.attrs[attr], + ) + reference = _standardize_dataset(reference) test = _standardize_dataset(test) From 5fba0e1dc4afda568b7daee46a7a02810db40117 Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Tue, 26 May 2026 16:15:44 -0400 Subject: [PATCH 7/7] fix: pass drop_path_from_attributes parameter correctly in mli_odf function --- ocean_data_parser/parsers/dfo/odf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocean_data_parser/parsers/dfo/odf.py b/ocean_data_parser/parsers/dfo/odf.py index b5d445610..a64a7be29 100644 --- a/ocean_data_parser/parsers/dfo/odf.py +++ b/ocean_data_parser/parsers/dfo/odf.py @@ -120,7 +120,7 @@ def mli_odf( path, vocabularies=["MLI", "GF3"], global_attributes={**mli_global_attributes, **(global_attributes or {})}, - drop_path_from_attributes=False, + drop_path_from_attributes=drop_path_from_attributes, encoding=encoding, )