Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add SDN parameter/UOM URNs and names (TEMPSZ01, PSALSZ01, INFLTF01, UPAA, PSUX, ULPM, etc.) for TSG temperature, salinity, and flow rate vocabulary entries.
- Add `pre-commit` configuration running `ruff` format and lint (mirrors `make lint`) on every commit, with a `make install-hooks` target to install it.
- Add `ocean_data_parser_version` global attribute to all parsed datasets by routing the NMEA, Star-Oddi DAT, and Sunburst superCO2 notes parsers through `standardize_dataset`.
- dfo.odf: add a `drop_path_from_attributes` option to strip directory paths from the
ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION attributes, keeping only the file name.
Defaults to True for `bio_odf` and False for `mli_odf` and `odf`.

### Fixed

Expand Down
21 changes: 19 additions & 2 deletions ocean_data_parser/parsers/dfo/odf.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,19 @@


def bio_odf(
path: str, global_attributes: dict = None, encoding="Windows-1252"
path: str,
global_attributes: dict = None,
encoding="Windows-1252",
drop_path_from_attributes=True,
) -> xarray.Dataset:
"""Bedford Institute of Ocean ODF format parser.

Args:
path (str): Path to the odf file to parse
global_attributes (dict): file specific global attributes
encoding (str): Encoding format of the file (default: Windows-1252)
drop_path_from_attributes (bool): Drop the path from the attributes
ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION

Returns:
dataset (xarray dataset): Parsed xarray dataset
Expand All @@ -89,17 +94,23 @@ def bio_odf(
vocabularies=["BIO", "GF3"],
global_attributes={**bio_global_attributes, **(global_attributes or {})},
encoding=encoding,
drop_path_from_attributes=drop_path_from_attributes,
)


def mli_odf(
path: str, global_attributes: dict = None, encoding="Windows-1252"
path: str,
global_attributes: dict = None,
encoding="Windows-1252",
drop_path_from_attributes=False,
) -> xarray.Dataset:
"""Maurice Lamontagne Institute ODF format parser.

Args:
path (str): Path to the odf file to parse
global_attributes (dict): file specific global attributes
drop_path_from_attributes (bool): Drop the path from the attributes
ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION
encoding (str): Encoding format of the file (default: Windows-1252)

Returns:
Expand All @@ -109,6 +120,7 @@ def mli_odf(
path,
vocabularies=["MLI", "GF3"],
global_attributes={**mli_global_attributes, **(global_attributes or {})},
drop_path_from_attributes=drop_path_from_attributes,
encoding=encoding,
)

Expand Down Expand Up @@ -141,13 +153,17 @@ def odf(
global_attributes: dict = None,
encoding: str = "Windows-1252",
filename_convention=FILE_NAME_CONVENTIONS,
drop_path_from_attributes: bool = False,
) -> xarray.Dataset:
"""ODF format parser.

Args:
path (str): Path to the odf file to parse
vocabularies (str): Vocabulary list to use for the vocabulary mapping
global_attributes (dict): file specific global attributes
drop_path_from_attributes (bool): Drop the path from the attributes
ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION

encoding (str): Encoding format of the file (default: Windows-1252)
filename_convention (str): File name convention to extract attributes.
Should be a regex expression.
Expand All @@ -159,6 +175,7 @@ def odf(
path,
vocabularies=vocabularies,
global_attributes={**odf_global_attributes, **(global_attributes or {})},
drop_path_from_attributes=drop_path_from_attributes,
encoding=encoding,
filename_convention=filename_convention,
)
33 changes: 33 additions & 0 deletions ocean_data_parser/parsers/dfo/odf_source/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,40 @@
]


def drop_path_from_header_attributes(header: dict) -> dict:
    """Drop the directory part from some of the parsed header attributes.

    Only ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION are
    considered. Both forward slashes and backslashes are treated as path
    separators, so Unix and Windows style paths are handled the same way.
    Missing sections, non-dict sections, and empty or non-string values are
    left untouched.

    Args:
        header (dict): Header attributes. The dictionary is modified in place.

    Returns:
        dict: The same header object, with the paths reduced to file names.
    """

    def _get_file(file_path: str) -> str:
        # Ignore trailing separators so "dir/file.odf/" does not collapse
        # to an empty string and silently erase the attribute.
        trimmed = file_path.rstrip("\\/")
        if not trimmed:
            # Nothing but separators: there is no file name to extract.
            return file_path
        return re.split(r"[\\/]", trimmed)[-1]

    attributes = (
        ("ODF_HEADER", "FILE_SPECIFICATION"),
        ("INSTRUMENT_HEADER", "DESCRIPTION"),
    )

    for header_key, attribute_key in attributes:
        section = header.get(header_key)
        if not isinstance(section, dict):
            continue
        value = section.get(attribute_key)
        # Only non-empty strings can hold a path; anything else (None,
        # numbers, lists, ...) would make re.split raise a TypeError.
        if isinstance(value, str) and value:
            section[attribute_key] = _get_file(value)

    return header


def parse_odf(
odf_path: str,
global_attributes: dict = None,
vocabularies: list = None,
add_attributes_existing_variables: bool = True,
generate_new_vocabulary_variables: bool = True,
drop_path_from_attributes: bool = False,
encoding: str = "Windows-1252",
filename_convention=FILE_NAME_CONVENTIONS,
) -> xr.Dataset:
Expand All @@ -61,6 +89,8 @@ def parse_odf(
Defaults to True.
generate_new_vocabulary_variables (bool, optional): Generate vocabulary variables.
Defaults to True.
drop_path_from_attributes (bool): Drop the path from the attributes
ODF_HEADER FILE_SPECIFICATION and INSTRUMENT_HEADER DESCRIPTION
encoding (str, optional): Encoding format of the file. Defaults to "Windows-1252".
filename_convention (str, optional): File name convention to extract attributes.
Should be a regex expression.
Expand All @@ -78,6 +108,9 @@ def parse_odf(
metadata["EVENT_HEADER"]["DATA_TYPE"],
)

if drop_path_from_attributes:
metadata = drop_path_from_header_attributes(metadata)

# Write global and variable attributes
file_name_attributes = (
re.search(filename_convention, Path(odf_path).name)
Expand Down
47 changes: 47 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
)
from ocean_data_parser.parsers.dfo.odf_source.attributes import _review_station
from ocean_data_parser.parsers.dfo.odf_source.parser import _convert_odf_time
from ocean_data_parser.parsers.dfo.odf_source.process import (
drop_path_from_header_attributes,
)


def search_caplog_records(caplog, message, levelname=None):
Expand Down Expand Up @@ -460,6 +463,50 @@ def test_odf_station_in_globals(self, global_attributes, original_header, statio
)
assert response == station, f"Failed to retrieve station={station}"

@pytest.mark.parametrize(
    ("path", "expect"),
    [
        ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"),
        (
            "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf",
            "CTD_001.odf",
        ),
        (
            r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf",
            "CTD_001.odf",
        ),
    ],
)
def test_odf_header_file_description_with_no_path(self, path, expect):
    """ODF_HEADER FILE_SPECIFICATION is reduced to its file name.

    Covers forward-slash and backslash separated paths.
    """
    header = {"ODF_HEADER": {"FILE_SPECIFICATION": path}}
    cleaned = drop_path_from_header_attributes(header)
    file_specification = cleaned["ODF_HEADER"]["FILE_SPECIFICATION"]
    assert file_specification == expect, "Failed to drop path from header attributes"

@pytest.mark.parametrize(
    ("path", "expect"),
    [
        ("tests/parsers_test_files/dfo/odf/bio/CTD/CTD_001.odf", "CTD_001.odf"),
        (
            "tests\\\\parsers_test_files\\\\dfo\\\\odf\\\\bio\\\\CTD\\\\CTD_001.odf",
            "CTD_001.odf",
        ),
        (
            r"\\tests\\parsers_test_files\\dfo\\odf\\bio\\CTD\\CTD_001.odf",
            "CTD_001.odf",
        ),
    ],
)
def test_instrument_header_description_no_path(self, path, expect):
    """INSTRUMENT_HEADER DESCRIPTION is reduced to its file name.

    Covers forward-slash and backslash separated paths.
    """
    header = {"INSTRUMENT_HEADER": {"DESCRIPTION": path}}
    cleaned = drop_path_from_header_attributes(header)
    description = cleaned["INSTRUMENT_HEADER"]["DESCRIPTION"]
    assert description == expect, "Failed to drop path from header attributes"


class TestODFBIOParser:
@pytest.mark.parametrize(
Expand Down
23 changes: 23 additions & 0 deletions tests/test_reference_netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,29 @@ def ignore_from_attr(attr, expression, placeholder):
reference.attrs.pop("ocean_data_parser_version", None)
test.attrs.pop("ocean_data_parser_version", None)

reference.attrs.pop("Conventions", None)
test.attrs.pop("Conventions", None)

# Normalize paths stripped by drop_path_from_header_attributes so older
# reference files (generated before path-stripping) still match.
def _basename(value):
if not isinstance(value, str):
return value
return re.split(r"\\|/", value)[-1]

for ds in (reference, test):
if "instrument_description" in ds.attrs:
ds.attrs["instrument_description"] = _basename(
ds.attrs["instrument_description"]
)
for attr in ("original_odf_header_json", "original_header"):
if attr in ds.attrs and isinstance(ds.attrs[attr], str):
ds.attrs[attr] = re.sub(
r'("(?:DESCRIPTION|FILE_SPECIFICATION)"\s*[:=]\s*")([^"]*)"',
lambda m: f'{m.group(1)}{_basename(m.group(2))}"',
ds.attrs[attr],
)

reference = _standardize_dataset(reference)
test = _standardize_dataset(test)

Expand Down
Loading