From bd3aab959047ec07868ff9ea9670b2f23d66f0e9 Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Wed, 18 Mar 2026 08:31:10 -0700 Subject: [PATCH 1/8] Add CTD thermal lag correction, YAML constants reader, and GSW salinity updates --- pyglider/ncprocess.py | 479 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 435 insertions(+), 44 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index b366d41..c04c9db 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -9,7 +9,13 @@ import netCDF4 import numpy as np import scipy.stats as stats +from scipy import signal import xarray as xr +import gsw +import yaml +from datetime import date + + import pyglider.utils as utils @@ -185,17 +191,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): with netCDF4.Dataset(outname, 'r+') as nc: nc.renameDimension('string%d' % trajlen, 'traj_strlen') - -def make_gridfiles( - inname, - outdir, - deploymentyaml, - *, - fnamesuffix='', - depth_bins=None, - dz=1, - starttime='1970-01-01', -): +def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None,dz=1,starttime='1970-01-01',maskfunction=None,interp_variables=None): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -211,21 +207,18 @@ def make_gridfiles( location of deployment yaml file for the netCDF file. This should be the same yaml file that was used to make the timeseries file. - depth_bins : array, default = None - User-defined depth bins, for instance ``np.arange(0, 1000.1, 1)``. - If not None, these are the depth bins into which the data will be - gridded. If None, ``dz`` is used to generate bins between 0 and 1100m - dz : float, default = 1 - Vertical grid spacing in meters. Ignored if ``depth_bins`` is not None + Vertical grid spacing in meters. + + maskfunction : callable or None, optional + Function applied to the dataset before gridding, usually to choose what data will be set to NaN based on quality flags. Returns ------- outname : str - Name of gridded netCDF file. The gridded netCDF file has dimensions of + Name of gridded netCDF file. The gridded netCDF file has coordinates of 'depth' and 'profile', so each variable is gridded in depth bins and by - profile number. Each profile has a time, latitude, and longitude. - The depth values are the bin centers + profile number. Each profile has a time, latitude, and longitude. """ try: os.mkdir(outdir) @@ -233,10 +226,12 @@ def make_gridfiles( pass deployment = utils._get_deployment(deploymentyaml) - profile_meta = deployment['profile_variables'] - ds = xr.open_dataset(inname, decode_times=True) + + if maskfunction is not None: + ds = maskfunction(ds) + ds = ds.where(ds.time > np.datetime64(starttime), drop=True) _log.info(f'Working on: {inname}') _log.debug(str(ds)) @@ -316,7 +311,7 @@ def make_gridfiles( _log.info(f'{td} {len(dat)}') dsout[td] = ((xdimname), dat, profile_meta[td]) - ds = ds.drop('time_1970') + ds = drop_vars('time_1970') _log.info(f'Done times!') for k in ds.keys(): @@ -324,34 +319,38 @@ def make_gridfiles( continue _log.info('Gridding %s', k) good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] + if len(good) <= 0: - continue - if 'average_method' in ds[k].attrs: - average_method = ds[k].attrs['average_method'] - ds[k].attrs['processing'] = ( - f'Using average method {average_method} for ' - f'variable {k} following deployment yaml.' - ) - if average_method == 'geometric mean': - average_method = stats.gmean - ds[k].attrs['processing'] += ( - ' Using geometric mean implementation ' 'scipy.stats.gmean' - ) + continue + if 'QC_protocol' in ds[k].attrs.values(): + # QC variables are treated as discrete flags rather than continuous data. + # If a variable has a QC_protocol attribute, it is gridded using the + # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). + method = np.nanmax else: - average_method = 'mean' + # variables are treated as continuous data. + # If a variable has a average_method attribute, it is gridded using the + # mean in each bin + if 'average_method' in ds[k].attrs.values(): + method = ds[k].attrs['average_method'] + if method == 'geometric mean': + method = stats.gmean + else: + method = 'mean' + dat, xedges, yedges, binnumber = stats.binned_statistic_2d( - ds['profile_index'].values[good], - ds['depth'].values[good], - values=ds[k].values[good], - statistic=average_method, - bins=[profile_bins, depth_bins], - ) + ds['profile_index'].values[good], + ds['depth'].values[good], + values=ds[k].values[good], + statistic=method, + bins=[profile_bins, depth_bins], + ) _log.debug(f'dat{np.shape(dat)}') dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs) - # fill gaps in data: - dsout[k].values = utils.gappy_fill_vertical(dsout[k].values) + if interp_variables is not None: + dsout[k] = interp_variables(dsout[k],ds[k]) # fix u and v, because they should really not be gridded... if ('water_velocity_eastward' in dsout.keys()) and ('u' in profile_meta.keys()): @@ -422,6 +421,398 @@ def make_gridfiles( return outname +def CPROOF_mask(ds): + """Mask QC4 samples in data variables (set to NaN) so gridding ignores them. + Does NOT shorten arrays or overwrite QC variables. + """ + _log = logging.getLogger(__name__) + + ds = ds.copy() + + for k in list(ds.data_vars): + # skip QC variables themselves + if k.endswith("_QC"): + continue + + qc_name = f"{k}_QC" + if qc_name not in ds: + continue + + # mask data where QC == 4, preserving dims/coords + ds[k] = ds[k].where(ds[qc_name] != 4) + ds[qc_name] = ds[qc_name].where(ds[qc_name] != 4) + + return ds + +def interpolate_vertical(var, attr): + """ + Optional: Interpolates variables over gaps of 50m. If the variable has 'QC_protocol' in the metadata, NaN gaps are filled + with QC1 (good flag) + + Parameters + ---------- + var: DataArray + Timeseries of a data variable + + attr: + + """ + + # QC variables: fill interpolatable NaN gaps with 1 + if 'QC_protocol' in attr.attrs.values(): + interp = var.interpolate_na(dim="depth", method="nearest", max_gap=50) + filled = np.isnan(var) & np.isfinite(interp) + return xr.where(filled, 1, var) + + # Continuous variables: linear interpolation + return var.interpolate_na(dim="depth", method="linear", max_gap=50) + +def read_ctd_constants_from_yaml(deploymentyaml): + """ + Read CTD correction constants from deployment YAML. + Missing values are returned as None. + """ + with open(deploymentyaml, "r") as f: + data = yaml.safe_load(f) or {} + ctd = data.get("glider_devices", {}).get("ctd", {}) + thermal = ctd.get("Thermal_lag_constants_[alpha,tau]", None) + alpha = None + tau = None + if thermal is not None: + if len(thermal) > 0: + alpha = thermal[0] + if len(thermal) > 1: + tau = thermal[1] + dTdC = ctd.get("dTdC", None) + return { + "alpha": alpha, + "tau": tau, + "dTdC": dTdC, + } +def ctd_constants(deployfile, *, alpha=None, tau=None, dTdC=None): + """ + Read CTD correction constants from YAML, allow kwargs to override, + warn on conflicts, and error if any required value is still missing. + """ + with open(deployfile, "r") as file: + data = yaml.safe_load(file) or {} + + atr = data.get("glider_devices", {}).get("ctd", {}) + + thermal = atr.get("Thermal_lag_constants_[alpha,tau]") + yaml_alpha = None + yaml_tau = None + if thermal is not None: + if len(thermal) > 0: + yaml_alpha = thermal[0] + if len(thermal) > 1: + yaml_tau = thermal[1] + + yaml_dTdC = atr.get("dTdC") + + vals_yaml = {"alpha": yaml_alpha, "tau": yaml_tau, "dTdC": yaml_dTdC} + vals_kw = {"alpha": alpha, "tau": tau, "dTdC": dTdC} + + out = {} + for key in vals_yaml: + yaml_val = vals_yaml[key] + kw_val = vals_kw[key] + + if kw_val is not None: + if yaml_val is not None and yaml_val != kw_val: + _log.warning( + "%s differs between YAML (%r) and kwargs (%r); using kwargs.", + key, yaml_val, kw_val + ) + out[key] = kw_val + else: + out[key] = yaml_val + + missing = [k for k, v in out.items() if v is None] + if missing: + raise ValueError( + f"Missing required CTD constants after checking kwargs and YAML: {missing}" + ) + + return out["alpha"], out["tau"], out["dTdC"] + +def get_conductivity_clean(ts0, dT, dz, flag_stdev, clean_stdev, accuracy=None): + + """ + Temporarily flags any data points that are more than 5 standard deviations away from the overall time series mean + for a given depth bin and profile bin, then recomputes the mean and standard deviation, excluding the temporarily flagged values. + Conductivity values that still differ from the mean by more than 3 standard deviations are flagged as 'bad' (QC 4). + If the difference between the 'bad' values and the mean is less than the accuracy of the sensor, then those points are not excluded. + + Parameters + ---------- + ts0 : timeseries DataArray + Timeseries of mission + + dT : float + Ssize of the profile bins + + dz : float + Size of depth bins in meters + + flag_stdev : float + Number of standard deviations to temporarily flag bad salinity values + + clean_stdev : float + Number of standard deviations to flag bad conductivity values, after removing the temporary bad values from the calculation + + accuracy : callable or None, optional + Accuracy of the sensor + + Returns + ------- + ts : timeseries + Timeseries of mission with new variable ' + """ + ts = ts0.copy(deep=True).load() + ts = ts.where(np.isfinite(ts.conductivity), drop=False) + Tbins = np.arange(np.min(ts.profile_index), np.max(ts.profile_index) + dT, dT) + zbins = np.arange(np.min(ts.depth), np.max(ts.depth) + dz, dz) + qc = np.full(ts.conductivity.shape, 4, dtype=int) + finite_cond = np.isfinite(ts.conductivity.values) + for n in range(len(Tbins) - 1): + ind_Tbin = ((ts.profile_index.values >= Tbins[n]) & + (ts.profile_index.values <= Tbins[n + 1]) &finite_cond) + if not np.any(ind_Tbin): + continue + cond = ts.conductivity.values[ind_Tbin] + depth = ts.depth.values[ind_Tbin] + ind_bad_z = np.zeros(len(cond), dtype=bool) + for m in range(len(zbins) - 1): + ind_zbin = (depth >= zbins[m]) & (depth <= zbins[m + 1]) + if not np.any(ind_zbin): + continue + cond_z = cond[ind_zbin] + cond_mean = np.nanmean(cond_z) + cond_std = np.nanstd(cond_z) + ind_flag = ((np.abs(cond_z - cond_mean) > flag_stdev * cond_std) & + (np.abs(cond_z - cond_mean) > accuracy)) + # If everything got flagged, skip second-pass cleaning + if np.all(ind_flag): + ind_bad = ind_flag + else: + clean_mean = np.nanmean(cond_z[~ind_flag]) + clean_std = np.nanstd(cond_z[~ind_flag]) + ind_bad = ( (np.abs(cond_z - clean_mean) > clean_stdev * clean_std) & + (np.abs(cond_z - clean_mean) > accuracy)) + ind_bad_z[ind_zbin] = ind_bad + qc_subset = np.where(ind_bad_z, 4, 1)# Good = 1, Bad = 4 + qc[ind_Tbin] = qc_subset + ts = ts.assign(conductivity_QC=(ts.conductivity.dims, qc)) + + return ts + + +def CPROOF_sal_interpolate_filter(ds): + """ + Function applied to the dataset before finding the internal temperature. Function interpolates temperature over bad data and small data gaps + to prevent errors from affecting the neighbouring cells. + + Parameters + ---------- + ds: DataArray + Timeseries of mission data + + Returns + ---------- + interp: DataArray + Timeseries of interpolated temperature + + """ + interp = ds["temperature"].where(ds["temperature_QC"] != 4) + qc4 = (ds["temperature_QC"] == 4) + qc4_buf = qc4.rolling(time=5, center=True, min_periods=1).max().astype(bool) + interp = interp.where(~qc4_buf) + + interp = interp.interpolate_na( + dim="time", + method="linear", + max_gap=np.timedelta64(60, "s")) + + return interp + +def correct_sal(ds, fn, alpha, tau, interpolate_filter = None): + """ + Function from Garau et al. (2011): estimates temperature inside the conductivity cell then recaluclates salinity + + Parameters + ---------- + ds: DataArray + Timeseries of mission data + + fn: float + Sampling frequency of the sensor + + alpha : float + Thermal lag strength constant for the sensor. + + tau: float + Thermal lag time constant for the sensor. + + interpolate_filter: callable or None, optional + Function applied to the dataset before finding the internal temperature. Function interpolates over bad data and small data gaps + to prevent errors from affecting the neighbouring cells. + + Returns + ---------- + sal: DataArray + Timeseries of salinity_adjusted calculated using the internal temperature of the conductivity cell. + """ + if interpolate_filter is not None: + temp = interpolate_filter(ds) + else: + temp = ds.temperature_adjusted + + a = 4 * fn * alpha * tau / (1 + 4*fn*tau) + b = 1 - 2 * a / alpha + aa = [1, b] + bb = [a, -a] + tempcorr = temp.values.copy() + tempcell = temp.values.copy() + good = ~np.isnan(tempcell) + tempcorr[good] = signal.lfilter(bb, aa, temp.values[good]) + tempcell = tempcell - tempcorr + sal = gsw.SP_from_C(ds.conductivity* 10, tempcell, ds.pressure) + + return sal + +def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha= None, tau = None, dTdC=None, accuracy = None): + + ts = xr.open_dataset(uncorrected_file) + + #############Identify anomalous conductivity values ############ + flag_stdev = 5 #number of standard deviations to temporarily flag bad salinity values + clean_stdev = 3 #number of standard deviations to flag bad conductivity values, after removing the temporary bad values from the calc + dT = 50 #size of the profile bins + dz = 5 #size of the depth bins + + ts0 = ts.copy() + ts0.conductivity[ts0.conductivity<0.1] = np.nan + ts = get_conductivity_clean(ts0, dT, dz, flag_stdev, clean_stdev, accuracy=0.0003) + + ############## Flag salinity as QC4 where there are QC4 conductivity values (bad in == bad out) + sal_QC = xr.where(ts.conductivity_QC == 4,4,1) + ts['salinity_QC'] = sal_QC + ts['temperature_QC'] =(('time'), np.ones_like(ts.temperature) ) + + #added to C-PROOF files for gridding function + vars_ = ['conductivity_QC','salinity_QC','temperature_QC'] + for var in vars_: + ts[var].attrs["average_method"] = "QC_protocol" + + ############## Resolve constants from kwargs and YAML + alpha, tau, dTdC = ctd_constants(deploymentyaml,alpha=alpha,tau=tau,dTdC=dTdC) + + temp_adj = ts.temperature.copy() + temp_adj.attrs = ts.temperature.attrs.copy() + temp_adj.attrs['comment'] = 'temperature [degC]' + + ############## if there is a time lag + if dTdC is not None: + dt = np.timedelta64(dTdC, 's') + temp_adj = temp_adj.interp(time=ts.time + dt) + temp_adj.attrs = ts.temperature.attrs.copy() + temp_adj.attrs['comment'] = 'temperature [degC]' + temp_adj.attrs['time_lag'] = f'{dTdC} second CT lag corrected' + ts.attrs['dTdC'] = f'{dTdC} second CT lag corrected' + + ts['temperature_adjusted'] = temp_adj + + ############## Build salinity_adjusted + if tau is not None: + dt = np.diff(ts.time.values).astype("timedelta64[s]").astype(int) + vals, counts = np.unique(dt, return_counts=True) + srate = vals[np.argmax(counts)] + fs = 1 / float(srate) + fn = 0.5 * fs + + s = correct_sal(ts,fn,alpha=alpha,tau=tau,) + + sal_adj = xr.where(ts.salinity_QC == 1, s, ts.salinity) + sal_adj.attrs = ts.salinity.attrs.copy() + sal_adj.attrs['thermal_lag'] = (f'adjusted salinity [psu] using a thermal lag correction with alpha = {alpha} and tau = {tau}') + if dTdC is not None: + sal_adj.attrs['time_lag'] = f'found using temperature_adjusted corrected for {dTdC} second CT lag' + + ts['salinity_adjusted'] = sal_adj + ts.attrs['correction_constants'] = f'alpha = {alpha}; tau={tau}' + + else: + sal_adj = xr.DataArray( + gsw.conversions.SP_from_C(10 *ts['conductivity'],ts['temperature_adjusted'],ts.pressure).values,dims=ts.salinity.dims,coords=ts.salinity.coords) + sal_adj.attrs = ts.salinity.attrs.copy() + if dTdC is not None: + sal_adj.attrs['time_lag'] = f'found using temperature_adjusted' + + ts['salinity_adjusted'] = sal_adj + + ############# Update metadata + ts.attrs['processing_details'] = 'Processing details are located on the C-PROOF website for this mission under the reports tab.' + ts.attrs['processing_tech'] = 'Lauryn Talbot; ltalbot@uvic.ca' + ts.attrs['citation'] = '"Klymak, J., & Ross, T. (2025). C-PROOF Underwater Glider Deployment Datasets [Data set]. Canadian-Pacific Robotic Ocean Observing Facility.doi:10.82534/44DS-K310"' + ts.attrs['references'] = 'https://doi.org/10.82534/44DS-K310' + ts.attrs['quality_flags']=['1 = good data; 3 = bad data, potentially correctable; 4 = bad data; 8 = estimated data'] + + # Uncorrected conductivity + ts['conductivity'].attrs['comment'] = 'uncorrected conductivity' + ts['conductivity_QC'] = ts.conductivity_QC + + # Uncorrected temperature + ts['temperature'].attrs['comment'] = 'uncorrected temperature [degC]' + ts['temperature_QC'] = ts.temperature_QC + + # Adjusted temperature + ts['temperature_adjusted_QC'] = ts['temperature_QC'] + + # Uncorrected salinity + ts['salinity'].attrs['comment'] = 'uncorrected salinity [psu]' + + # Corrected salinity + ts['salinity_adjusted_QC'] = ts['salinity_QC'] + + # Unadjusted density + ts['density'].attrs['comment'] = 'unadjusted density' + ts['density_QC'] = ts['salinity_QC'] #same as salinity since derived + + ########## Recalculate derived variables + + long = ts.longitude.fillna(ts.longitude.mean(skipna=True)) + lat = ts.latitude.fillna(ts.latitude.mean(skipna=True)) + sa_adj = gsw.SA_from_SP(ts['salinity_adjusted'],ts['pressure'],long,lat) + ct_adj = gsw.CT_from_t(sa_adj,ts['temperature_adjusted'],ts['pressure']) + ts['potential_density_adjusted'] = (('time'),1000 + gsw.density.sigma0(sa_adj,ct_adj).values) + ts['potential_density_adjusted'].attrs['comment'] = 'calculated using adjusted salinity' + ts['potential_density_adjusted_QC'] = ts['salinity_adjusted_QC'] + + ts['potential_temperature_adjusted'] = (('time'), + gsw.conversions.pt0_from_t(ts.salinity_adjusted,ts.temperature_adjusted,ts.pressure).values) + ts['potential_temperature_adjusted'].attrs['comment'] = 'calculated using adjusted salinity' + ts['potential_temperature_adjusted_QC'] = ts['salinity_adjusted_QC'] + + processing_date = date.today().strftime('%Y%m%d') + vars_ = ['salinity_adjusted','temperature_adjusted','potential_density_adjusted','potential_temperature_adjusted'] + for var in vars_: + ts[var].attrs['processing_date'] = processing_date + + vars_ = ['conductivity_QC','salinity_QC','temperature_QC','salinity_adjusted_QC', + 'temperature_adjusted_QC','potential_density_adjusted_QC','potential_temperature_adjusted'] + for var in vars_: + ts[var].attrs["comment"] = ['1 = good data; 3 = bad data, potentially correctable; 4 = bad data; 8 = estimated data'] + + ########### Make gridfile + deploy_name = ts.deployment_name + ts.to_netcdf(f'{ts_directory}/{deploy_name}_CTDadjusted.nc') + make_gridfiles(f'{ts_directory}/{deploy_name}_CTDadjusted.nc', f'{ds_directory}', deploymentyaml, fnamesuffix='_CTDadjusted', + maskfunction=CPROOF_mask,interp_variables=interpolate_vertical) + + return 'Files are saved' + + # aliases extract_L0timeseries_profiles = extract_timeseries_profiles From 19b4bdf91417d50684ffe7b3f49027b0358be653 Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 08:12:07 -0700 Subject: [PATCH 2/8] Fix deploymentyaml handling (support dict or path) --- .ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 + Untitled.ipynb | 6 + pyglider/ncprocess.py | 132 ++-- pyglider/ncprocess.py.save | 777 +++++++++++++++++++ 4 files changed, 840 insertions(+), 81 deletions(-) create mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 Untitled.ipynb create mode 100644 pyglider/ncprocess.py.save diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index c04c9db..fd8f0eb 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -191,7 +191,16 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): with netCDF4.Dataset(outname, 'r+') as nc: nc.renameDimension('string%d' % trajlen, 'traj_strlen') -def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None,dz=1,starttime='1970-01-01',maskfunction=None,interp_variables=None): +def make_gridfiles(inname, + outdir, + deploymentyaml, + *, + fnamesuffix='', + depth_bins=None, + dz=1, + starttime='1970-01-01', + maskfunction=None, + interp_variables=None): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -211,7 +220,8 @@ def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None Vertical grid spacing in meters. maskfunction : callable or None, optional - Function applied to the dataset before gridding, usually to choose what data will be set to NaN based on quality flags. + Function applied to the dataset before gridding, + usually to choose what data will be set to NaN based on quality flags. Returns ------- @@ -311,7 +321,7 @@ def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None _log.info(f'{td} {len(dat)}') dsout[td] = ((xdimname), dat, profile_meta[td]) - ds = drop_vars('time_1970') + ds = ds.drop_vars('time_1970') _log.info(f'Done times!') for k in ds.keys(): @@ -319,7 +329,6 @@ def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None continue _log.info('Gridding %s', k) good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] - if len(good) <= 0: continue if 'QC_protocol' in ds[k].attrs.values(): @@ -339,11 +348,11 @@ def make_gridfiles(inname,outdir,deploymentyaml,*,fnamesuffix='',depth_bins=None method = 'mean' dat, xedges, yedges, binnumber = stats.binned_statistic_2d( - ds['profile_index'].values[good], - ds['depth'].values[good], - values=ds[k].values[good], - statistic=method, - bins=[profile_bins, depth_bins], + ds['profile_index'].values[good], + ds['depth'].values[good], + values=ds[k].values[good], + statistic=method, + bins=[profile_bins, depth_bins], ) _log.debug(f'dat{np.shape(dat)}') @@ -467,74 +476,6 @@ def interpolate_vertical(var, attr): # Continuous variables: linear interpolation return var.interpolate_na(dim="depth", method="linear", max_gap=50) -def read_ctd_constants_from_yaml(deploymentyaml): - """ - Read CTD correction constants from deployment YAML. - Missing values are returned as None. - """ - with open(deploymentyaml, "r") as f: - data = yaml.safe_load(f) or {} - ctd = data.get("glider_devices", {}).get("ctd", {}) - thermal = ctd.get("Thermal_lag_constants_[alpha,tau]", None) - alpha = None - tau = None - if thermal is not None: - if len(thermal) > 0: - alpha = thermal[0] - if len(thermal) > 1: - tau = thermal[1] - dTdC = ctd.get("dTdC", None) - return { - "alpha": alpha, - "tau": tau, - "dTdC": dTdC, - } -def ctd_constants(deployfile, *, alpha=None, tau=None, dTdC=None): - """ - Read CTD correction constants from YAML, allow kwargs to override, - warn on conflicts, and error if any required value is still missing. - """ - with open(deployfile, "r") as file: - data = yaml.safe_load(file) or {} - - atr = data.get("glider_devices", {}).get("ctd", {}) - - thermal = atr.get("Thermal_lag_constants_[alpha,tau]") - yaml_alpha = None - yaml_tau = None - if thermal is not None: - if len(thermal) > 0: - yaml_alpha = thermal[0] - if len(thermal) > 1: - yaml_tau = thermal[1] - - yaml_dTdC = atr.get("dTdC") - - vals_yaml = {"alpha": yaml_alpha, "tau": yaml_tau, "dTdC": yaml_dTdC} - vals_kw = {"alpha": alpha, "tau": tau, "dTdC": dTdC} - - out = {} - for key in vals_yaml: - yaml_val = vals_yaml[key] - kw_val = vals_kw[key] - - if kw_val is not None: - if yaml_val is not None and yaml_val != kw_val: - _log.warning( - "%s differs between YAML (%r) and kwargs (%r); using kwargs.", - key, yaml_val, kw_val - ) - out[key] = kw_val - else: - out[key] = yaml_val - - missing = [k for k, v in out.items() if v is None] - if missing: - raise ValueError( - f"Missing required CTD constants after checking kwargs and YAML: {missing}" - ) - - return out["alpha"], out["tau"], out["dTdC"] def get_conductivity_clean(ts0, dT, dz, flag_stdev, clean_stdev, accuracy=None): @@ -684,7 +625,8 @@ def correct_sal(ds, fn, alpha, tau, interpolate_filter = None): def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha= None, tau = None, dTdC=None, accuracy = None): ts = xr.open_dataset(uncorrected_file) - + deployment = utils._get_deployment(deploymentyaml) + #############Identify anomalous conductivity values ############ flag_stdev = 5 #number of standard deviations to temporarily flag bad salinity values clean_stdev = 3 #number of standard deviations to flag bad conductivity values, after removing the temporary bad values from the calc @@ -705,9 +647,37 @@ def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha for var in vars_: ts[var].attrs["average_method"] = "QC_protocol" - ############## Resolve constants from kwargs and YAML - alpha, tau, dTdC = ctd_constants(deploymentyaml,alpha=alpha,tau=tau,dTdC=dTdC) + ############## Resolve constants from kwargs and YAML + atr = deployment.get("glider_devices", {}).get("ctd", {}) + thermal = atr.get("Thermal_lag_constants_[alpha,tau]") + yaml_alpha = None + yaml_tau = None + if thermal is not None: + if len(thermal) > 0: + yaml_alpha = thermal[0] + if len(thermal) > 1: + yaml_tau = thermal[1] + yaml_dTdC = atr.get("dTdC") + vals_yaml = {"alpha": yaml_alpha, "tau": yaml_tau, "dTdC": yaml_dTdC} + vals_kw = {"alpha": alpha, "tau": tau, "dTdC": dTdC} + out = {} + for key in vals_yaml: + yaml_val = vals_yaml[key] + kw_val = vals_kw[key] + if kw_val is not None: + if yaml_val is not None and yaml_val != kw_val: + _log.warning( + "%s differs between YAML (%r) and kwargs (%r); using kwargs.", + key, yaml_val, kw_val + ) + out[key] = kw_val + else: + out[key] = yaml_val + missing = [k for k, v in out.items() if v is None] + if missing: + raise ValueError( + f"Missing required CTD constants after checking kwargs and YAML: {missing}") temp_adj = ts.temperature.copy() temp_adj.attrs = ts.temperature.attrs.copy() temp_adj.attrs['comment'] = 'temperature [degC]' @@ -810,7 +780,7 @@ def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha make_gridfiles(f'{ts_directory}/{deploy_name}_CTDadjusted.nc', f'{ds_directory}', deploymentyaml, fnamesuffix='_CTDadjusted', maskfunction=CPROOF_mask,interp_variables=interpolate_vertical) - return 'Files are saved' + return diff --git a/pyglider/ncprocess.py.save b/pyglider/ncprocess.py.save new file mode 100644 index 0000000..bb2c8da --- /dev/null +++ b/pyglider/ncprocess.py.save @@ -0,0 +1,777 @@ +""" +Routines that are used for common processing of netcdf files after they have +been converted to standard timeseries. +""" + +import logging +import os + +import netCDF4 +import numpy as np +import scipy.stats as stats +from scipy import signal +import xarray as xr +import gsw +import yaml +from datetime import date + + + +import pyglider.utils as utils + +_log = logging.getLogger(__name__) + + +def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): + """ + Extract and save each profile from a timeseries netCDF. + + Parameters + ---------- + inname : str or Path + netcdf file to break into profiles + + outdir : str or Path + directory to place profiles + + deploymentyaml : str or Path + location of deployment yaml file for the netCDF file. This should + be the same yaml file that was used to make the timeseries file. + + force : bool, default False + Force an overwite even if profile netcdf already exists + """ + try: + os.mkdir(outdir) + except FileExistsError: + pass + + deployment = utils._get_deployment(deploymentyaml) + + meta = deployment['metadata'] + with xr.open_dataset(inname) as ds: + _log.info('Extracting profiles: opening %s', inname) + profiles = np.unique(ds.profile_index) + profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] + for p in profiles: + ind = np.where(ds.profile_index == p)[0] + dss = ds.isel(time=ind) + outname = outdir + '/' + utils.get_file_id(dss) + '.nc' + _log.info('Checking %s', outname) + if force or (not os.path.exists(outname)): + # this is the id for the whole file, not just this profile.. + dss['trajectory'] = utils.get_file_id(ds).encode() + trajlen = len(utils.get_file_id(ds).encode()) + dss['trajectory'].attrs['cf_role'] = 'trajectory_id' + dss['trajectory'].attrs['comment'] = ( + 'A trajectory is a single' + 'deployment of a glider and may span multiple data files.' + ) + dss['trajectory'].attrs['long_name'] = 'Trajectory/Deployment Name' + + # profile-averaged variables.... + profile_meta = deployment['profile_variables'] + if 'water_velocity_eastward' in dss.keys(): + dss['u'] = dss.water_velocity_eastward.mean() + dss['u'].attrs = profile_meta['u'] + + dss['v'] = dss.water_velocity_northward.mean() + dss['v'].attrs = profile_meta['v'] + elif 'u' in profile_meta: + dss['u'] = profile_meta['u'].get('_FillValue', np.nan) + dss['u'].attrs = profile_meta['u'] + + dss['v'] = profile_meta['v'].get('_FillValue', np.nan) + dss['v'].attrs = profile_meta['v'] + else: + dss['u'] = np.nan + dss['v'] = np.nan + + dss['profile_id'] = np.int32(p) + dss['profile_id'].attrs = profile_meta['profile_id'] + if '_FillValue' not in dss['profile_id'].attrs: + dss['profile_id'].attrs['_FillValue'] = -1 + dss['profile_id'].attrs['valid_min'] = np.int32( + dss['profile_id'].attrs['valid_min'] + ) + dss['profile_id'].attrs['valid_max'] = np.int32( + dss['profile_id'].attrs['valid_max'] + ) + + dss['profile_time'] = dss.time.mean() + dss['profile_time'].attrs = profile_meta['profile_time'] + # remove units so they can be encoded later: + try: + del dss.profile_time.attrs['units'] + del dss.profile_time.attrs['calendar'] + except KeyError: + pass + dss['profile_lon'] = dss.longitude.mean() + dss['profile_lon'].attrs = profile_meta['profile_lon'] + dss['profile_lat'] = dss.latitude.mean() + dss['profile_lat'].attrs = profile_meta['profile_lat'] + + dss['lat'] = dss['latitude'] + dss['lon'] = dss['longitude'] + dss['platform'] = np.int32(1) + comment = meta['glider_model'] + ' operated by ' + meta['institution'] + dss['platform'].attrs['comment'] = comment + dss['platform'].attrs['id'] = ( + meta['glider_name'] + meta['glider_serial'] + ) + dss['platform'].attrs['instrument'] = 'instrument_ctd' + dss['platform'].attrs['long_name'] = ( + meta['glider_model'] + dss['platform'].attrs['id'] + ) + dss['platform'].attrs['type'] = 'platform' + dss['platform'].attrs['wmo_id'] = meta['wmo_id'] + if '_FillValue' not in dss['platform'].attrs: + dss['platform'].attrs['_FillValue'] = -1 + + dss['lat_uv'] = np.nan + dss['lat_uv'].attrs = profile_meta['lat_uv'] + dss['lon_uv'] = np.nan + dss['lon_uv'].attrs = profile_meta['lon_uv'] + dss['time_uv'] = np.nan + dss['time_uv'].attrs = profile_meta['time_uv'] + + dss['instrument_ctd'] = np.int32(1.0) + dss['instrument_ctd'].attrs = profile_meta['instrument_ctd'] + if '_FillValue' not in dss['instrument_ctd'].attrs: + dss['instrument_ctd'].attrs['_FillValue'] = -1 + + dss.attrs['date_modified'] = str(np.datetime64('now')) + 'Z' + + # ancillary variables: link and create with values of 2. If + # we dont' want them all 2, then create these variables in the + # time series + to_fill = [ + 'temperature', + 'pressure', + 'conductivity', + 'salinity', + 'density', + 'lon', + 'lat', + 'depth', + ] + for name in to_fill: + qcname = name + '_qc' + dss[name].attrs['ancillary_variables'] = qcname + if qcname not in dss.keys(): + dss[qcname] = ('time', 2 * np.ones(len(dss[name]), np.int8)) + dss[qcname].attrs = utils.fill_required_qcattrs({}, name) + # 2 is "not eval" + # outname = outdir + '/' + utils.get_file_id(dss) + '.nc' + _log.info('Writing %s', outname) + timeunits = 'seconds since 1970-01-01T00:00:00Z' + timecalendar = 'gregorian' + try: + del dss.profile_time.attrs['_FillValue'] + del dss.profile_time.attrs['units'] + except KeyError: + pass + dss.to_netcdf( + outname, + encoding={ + 'time': { + 'units': timeunits, + 'calendar': timecalendar, + 'dtype': 'float64', + }, + 'profile_time': { + 'units': timeunits, + '_FillValue': -99999.0, + 'dtype': 'float64', + }, + }, + ) + + # add traj_strlen using bare ntcdf to make IOOS happy + with netCDF4.Dataset(outname, 'r+') as nc: + nc.renameDimension('string%d' % trajlen, 'traj_strlen') + +def make_gridfiles( +<<<<<<< HEAD + inname, + outdir, + deploymentyaml, + *, + fnamesuffix='', + depth_bins=None, + dz=1, + starttime='1970-01-01', +): +======= + inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=None, interp_variabls=None): +>>>>>>> 9e2b1db (Add CTD thermal lag correction, YAML constants reader, and GSW salinity updates) + """ + Turn a timeseries netCDF file into a vertically gridded netCDF. + + Parameters + ---------- + inname : str or Path + netcdf file to break into profiles + + outdir : str or Path + directory to place profiles + + deploymentyaml : str or Path + location of deployment yaml file for the netCDF file. This should + be the same yaml file that was used to make the timeseries file. + + depth_bins : array, default = None + User-defined depth bins, for instance ``np.arange(0, 1000.1, 1)``. + If not None, these are the depth bins into which the data will be + gridded. If None, ``dz`` is used to generate bins between 0 and 1100m + + dz : float, default = 1 + Vertical grid spacing in meters. Ignored if ``depth_bins`` is not None + + maskfunction : callable or None, optional + Function applied to the dataset before gridding, usually to choose what data will be set to NaN based on quality flags. + + Returns + ------- + outname : str + Name of gridded netCDF file. The gridded netCDF file has dimensions of + 'depth' and 'profile', so each variable is gridded in depth bins and by + profile number. Each profile has a time, latitude, and longitude. + The depth values are the bin centers + """ + try: + os.mkdir(outdir) + except FileExistsError: + pass + + deployment = utils._get_deployment(deploymentyaml) + profile_meta = deployment['profile_variables'] + ds = xr.open_dataset(inname, decode_times=True) + + if maskfunction is not None: + ds = maskfunction(ds) + + ds = ds.where(ds.time > np.datetime64(starttime), drop=True) + _log.info(f'Working on: {inname}') + _log.debug(str(ds)) + _log.debug(str(ds.time[0])) + _log.debug(str(ds.time[-1])) + + profiles = np.unique(ds.profile_index) + profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] + profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1] + 0.5])) + _log.debug(profile_bins) + Nprofiles = len(profiles) + _log.info(f'Nprofiles {Nprofiles}') + + if depth_bins is None: + # calculate depth bins using dz + depth_bins = np.arange(0, 1100.1, dz) + else: + # sanity check user-provided bins + if ( + depth_bins.ndim != 1 + or not np.all(np.isfinite(depth_bins)) + or not np.issubdtype(depth_bins.dtype, np.number) + ): + raise ValueError('Depth bins must be a 1D array of finite numbers') + if len(depth_bins) < 2: + raise ValueError('There must be at least two depth bins edges') + if not np.all(np.diff(depth_bins) > 0): + raise ValueError('Depth bin edges must be strictly increasing and non-overlapping') + + # calculate bin centers + depths = 0.5*(depth_bins[:-1] + depth_bins[1:]) + _log.debug(f'depth bins and centers {depth_bins} {{depths}}') + + xdimname = 'time' + dsout = xr.Dataset( + coords={'depth': ('depth', depths), 'profile': (xdimname, profiles)} + ) + dsout['depth'].attrs = { + 'units': 'm', + 'long_name': 'Depth', + 'standard_name': 'depth', + 'positive': 'down', + 'source': ds.depth.attrs["source"], + 'coverage_content_type': 'coordinate', + 'comment': 'center of depth bins', + } + + # Bin by profile index, for the mean time, lat, and lon values for each profile + ds['time_1970'] = ds.temperature.copy() + ds['time_1970'].values = ds.time.values.astype(np.float64) + for td in ('time_1970', 'longitude', 'latitude'): + good = np.where(~np.isnan(ds[td]) & (ds['profile_index'] % 1 == 0))[0] + dat, xedges, binnumber = stats.binned_statistic( + ds['profile_index'].values[good], + ds[td].values[good], + statistic='mean', + bins=[profile_bins], + ) + if td == 'time_1970': + td = 'time' + dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00') + _log.info(f'{td} {len(dat)}') + dsout[td] = (('time'), dat, ds[td].attrs) +<<<<<<< HEAD + + # Bin by profile index, for the profile start (min) and end (max) times + profile_lookup = {'profile_time_start': "min", 'profile_time_end': "max"} + good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0] + for td, bin_stat in profile_lookup.items(): + _log.debug(f'td, bin_stat {td}, {bin_stat}') + dat, xedges, binnumber = stats.binned_statistic( + ds['profile_index'].values[good], + ds['time_1970'].values[good], + statistic=bin_stat, + bins=[profile_bins], + ) + dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00') + _log.info(f'{td} {len(dat)}') + dsout[td] = ((xdimname), dat, profile_meta[td]) + + ds = ds.drop('time_1970') + _log.info(f'Done times!') + +======= + ds.drop_vars('time_1970') + good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0] + _log.info(f'Done times! {len(dat)}') + dsout['profile_time_start'] = ((xdimname), dat, profile_meta['profile_time_start']) + dsout['profile_time_end'] = ((xdimname), dat, profile_meta['profile_time_end']) + +>>>>>>> 9e2b1db (Add CTD thermal lag correction, YAML constants reader, and GSW salinity updates) + for k in ds.keys(): + if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k: + continue + _log.info('Gridding %s', k) + good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] + + if len(good) <= 0: + continue + if 'QC_protocol' in ds[k].attrs.values(): + # QC variables are treated as discrete flags rather than continuous data. + # If a variable has a QC_protocol attribute, it is gridded using the + # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). + method = np.nanmax + else: + # variables are treated as continuous data. + # If a variable has a average_method attribute, it is gridded using the + # mean in each bin + if 'average_method' in ds[k].attrs.values(): + method = ds[k].attrs['average_method'] + if method == 'geometric mean': + method = stats.gmean + else: + method = 'mean' + + dat, xedges, yedges, binnumber = stats.binned_statistic_2d( + ds['profile_index'].values[good], + ds['depth'].values[good], + values=ds[k].values[good], + statistic=method, + bins=[profile_bins, depth_bins], + ) + + _log.debug(f'dat{np.shape(dat)}') + dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs) + + if interp_variables is not None: + dsout[k] = interp_variables(dsout[k],ds[k]) + + # fix u and v, because they should really not be gridded... + if ('water_velocity_eastward' in dsout.keys()) and ('u' in profile_meta.keys()): + _log.debug(str(ds.water_velocity_eastward)) + dsout['u'] = dsout.water_velocity_eastward.mean(axis=0) + dsout['u'].attrs = profile_meta['u'] + dsout['v'] = dsout.water_velocity_northward.mean(axis=0) + dsout['v'].attrs = profile_meta['v'] + dsout = dsout.drop(['water_velocity_eastward', 'water_velocity_northward']) + dsout.attrs = ds.attrs + dsout.attrs.pop('cdm_data_type') + + # fix to be ISO parsable: + if len(dsout.attrs['deployment_start']) > 18: + dsout.attrs['deployment_start'] = dsout.attrs['deployment_start'][:19] + dsout.attrs['deployment_end'] = dsout.attrs['deployment_end'][:19] + dsout.attrs['time_coverage_start'] = dsout.attrs['time_coverage_start'][:19] + dsout.attrs['time_coverage_end'] = dsout.attrs['time_coverage_end'][:19] + # fix standard_name so they don't overlap! + try: + dsout['waypoint_latitude'].attrs.pop('standard_name') + dsout['waypoint_longitude'].attrs.pop('standard_name') + dsout['profile_time_start'].attrs.pop('standard_name') + dsout['profile_time_end'].attrs.pop('standard_name') + except: + pass + # remove, so they can be encoded later: + try: + dsout['profile_time_start'].attrs.pop('units') + dsout['profile_time_end'].attrs.pop('units') + dsout['profile_time_start'].attrs.pop('_FillValue') + dsout['profile_time_end'].attrs.pop('_FillValue') + except: + pass + + # set some attributes for cf guidance + # see H.6.2. Profiles along a single trajectory + # https://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/aphs06.html + dsout.attrs['featureType'] = 'trajectoryProfile' + dsout['profile'].attrs['cf_role'] = 'profile_id' + dsout['mission_number'] = np.int32(1) + dsout['mission_number'].attrs['cf_role'] = 'trajectory_id' + dsout = dsout.set_coords(['latitude', 'longitude', 'time']) + for k in dsout: + if k in ['profile', 'depth', 'latitude', 'longitude', 'time', 'mission_number']: + dsout[k].attrs['coverage_content_type'] = 'coordinate' + else: + dsout[k].attrs['coverage_content_type'] = 'physicalMeasurement' + + outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid' + fnamesuffix + '.nc' + _log.info('Writing %s', outname) + # timeunits = 'nanoseconds since 1970-01-01T00:00:00Z' + time_encoding = { + 'units': 'seconds since 1970-01-01T00:00:00Z', + '_FillValue': np.nan, + 'calendar': 'gregorian', + 'dtype': 'float64', + } + dsout.to_netcdf( + outname, + encoding={ + 'time': time_encoding, + 'profile_time_start': time_encoding, + 'profile_time_end': time_encoding, + }, + ) + _log.info('Done gridding') + + return outname + +def CPROOF_mask(ds): + """Mask QC4 samples in data variables (set to NaN) so gridding ignores them. + Does NOT shorten arrays or overwrite QC variables. + """ + _log = logging.getLogger(__name__) + + ds = ds.copy() + + for k in list(ds.data_vars): + # skip QC variables themselves + if k.endswith("_QC"): + continue + + qc_name = f"{k}_QC" + if qc_name not in ds: + continue + + # mask data where QC == 4, preserving dims/coords + ds[k] = ds[k].where(ds[qc_name] != 4) + ds[qc_name] = ds[qc_name].where(ds[qc_name] != 4) + + return ds + +def interpolate_vertical(var, attr): + """ + Optional: Interpolates variables over gaps of 50m. If the variable has 'QC_protocol' in the metadata, NaN gaps are filled + with QC1 (good flag) + + Parameters + ---------- + var: DataArray + Timeseries of a data variable + + attr: + + """ + + # QC variables: fill interpolatable NaN gaps with 1 + if 'QC_protocol' in attr.attrs.values(): + interp = var.interpolate_na(dim="depth", method="nearest", max_gap=50) + filled = np.isnan(var) & np.isfinite(interp) + return xr.where(filled, 1, var) + + # Continuous variables: linear interpolation + return var.interpolate_na(dim="depth", method="linear", max_gap=50) + +def ctd_constants(deployfile): + """ + Read corrections for the thermal lag correction from the .ymal file + """ + with open(deployfile, 'r') as file: + data = yaml.safe_load(file) + atr = data['glider_devices']['ctd'] + if atr['Thermal_lag_constants_[alpha,tau]']: + alpha = atr['Thermal_lag_constants_[alpha,tau]'][0] + tau = atr['Thermal_lag_constants_[alpha,tau]'][1] + else: + alpha = None + tau = None + if atr['dTdC']: + dTdC = atr['dTdC'] + else: + dTdC = None + + return alpha,tau,dTdC + +def get_conductivity_clean(ts0, dT, dz, flag_stdev, clean_stdev, accuracy=None): + + """ + Temporarily flags any data points that are more than 5 standard deviations away from the overall time series mean + for a given depth bin and profile bin, then recomputes the mean and standard deviation, excluding the temporarily flagged values. + Conductivity values that still differ from the mean by more than 3 standard deviations are flagged as 'bad' (QC 4). + If the difference between the 'bad' values and the mean is less than the accuracy of the sensor, then those points are not excluded. + + Parameters + ---------- + ts0 : timeseries DataArray + Timeseries of mission + + dT : float + Ssize of the profile bins + + dz : float + Size of depth bins in meters + + flag_stdev : float + Number of standard deviations to temporarily flag bad salinity values + + clean_stdev : float + Number of standard deviations to flag bad conductivity values, after removing the temporary bad values from the calculation + + accuracy : callable or None, optional + Accuracy of the sensor + + Returns + ------- + ts : timeseries + Timeseries of mission with new variable ' + """ + ts = ts0.copy(deep=True).load() + ts = ts.where(np.isfinite(ts.conductivity), drop=False) + Tbins = np.arange(np.min(ts.profile_index), np.max(ts.profile_index) + dT, dT) + zbins = np.arange(np.min(ts.depth), np.max(ts.depth) + dz, dz) + qc = np.full(ts.conductivity.shape, 4, dtype=int) + finite_cond = np.isfinite(ts.conductivity.values) + for n in range(len(Tbins) - 1): + ind_Tbin = ((ts.profile_index.values >= Tbins[n]) & + (ts.profile_index.values <= Tbins[n + 1]) &finite_cond) + if not np.any(ind_Tbin): + continue + cond = ts.conductivity.values[ind_Tbin] + depth = ts.depth.values[ind_Tbin] + ind_bad_z = np.zeros(len(cond), dtype=bool) + for m in range(len(zbins) - 1): + ind_zbin = (depth >= zbins[m]) & (depth <= zbins[m + 1]) + if not np.any(ind_zbin): + continue + cond_z = cond[ind_zbin] + cond_mean = np.nanmean(cond_z) + cond_std = np.nanstd(cond_z) + ind_flag = ((np.abs(cond_z - cond_mean) > flag_stdev * cond_std) & + (np.abs(cond_z - cond_mean) > accuracy)) + # If everything got flagged, skip second-pass cleaning + if np.all(ind_flag): + ind_bad = ind_flag + else: + clean_mean = np.nanmean(cond_z[~ind_flag]) + clean_std = np.nanstd(cond_z[~ind_flag]) + ind_bad = ( (np.abs(cond_z - clean_mean) > clean_stdev * clean_std) & + (np.abs(cond_z - clean_mean) > accuracy)) + ind_bad_z[ind_zbin] = ind_bad + qc_subset = np.where(ind_bad_z, 4, 1)# Good = 1, Bad = 4 + qc[ind_Tbin] = qc_subset + ts = ts.assign(conductivity_QC=(ts.conductivity.dims, qc)) + + return ts + + +def CPROOF_sal_interpolate_filter(ds): + """ + Function applied to the dataset before finding the internal temperature. Function interpolates temperature over bad data and small data gaps + to prevent errors from affecting the neighbiuring cells. + + Parameters + ---------- + ds: DataArray + Timeseries of mission data + + Returns + ---------- + interp: DataArray + Timeseries of interpolated temperature + + """ + interp = ds["temperature"].where(ds["temperature_QC"] != 4) + qc4 = (ds["temperature_QC"] == 4) + qc4_buf = qc4.rolling(time=5, center=True, min_periods=1).max().astype(bool) + interp = interp.where(~qc4_buf) + + interp = interp.interpolate_na( + dim="time", + method="linear", + max_gap=np.timedelta64(60, "s")) + + return interp + +def correct_sal(ds, fn, alpha, tau, interpolate_filter = None): + """ + Function from Garau et al. (2011): estimates temperature inside the conductivity cell then recaluclates salinity + + Parameters + ---------- + ds: DataArray + Timeseries of mission data + + fn: float + Sampling frequency of the sensor + + alpha : float + Thermal lag strength constant for the sensor. + + tau: float + Thermal lag time constant for the sensor. + + interpolate_filter: callable or None, optional + Function applied to the dataset before finding the internal temperature. Function interpolates over bad data and small data gaps + to prevent errors from affecting the neighbouring cells. + + Returns + ---------- + sal: DataArray + Timeseries of salinity_adjusted calculated using the internal temperature of the conductivity cell. + """ + if interpolate_filter is not None: + temp = interpolate_filter(ds) + else: + temp = ds.temperature + + a = 4 * fn * alpha * tau / (1 + 4*fn*tau) + b = 1 - 2 * a / alpha + aa = [1, b] + bb = [a, -a] + tempcorr = temp.values.copy() + tempcell = temp.values.copy() + good = ~np.isnan(tempcell) + tempcorr[good] = signal.lfilter(bb, aa, temp.values[good]) + tempcell = tempcell - tempcorr + sal = gsw.SP_from_C(ds.conductivity* 10, tempcell, ds.pressure) + + return sal + +def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha= None, tau = None, dTdC=None, accuracy = None,): + + ts = xr.open_dataset(uncorrected_file) + + #############Identify anomalous conductivity values ############ + flag_stdev = 5 #number of standard deviations to temporarily flag bad salinity values + clean_stdev = 3 #number of standard deviations to flag bad conductivity values, after removing the temporary bad values from the calc + dT = 50 #size of the profile bins + dz = 5 #size of the depth bins + + ts0 = ts.copy() + ts0.conductivity[ts0.conductivity<0.1] = np.nan + ts = get_conductivity_clean(ts0, dT, dz, flag_stdev, clean_stdev, accuracy=0.0003) + + ############## Flag salinity as QC4 where there are QC4 conductivity values (bad in == bad out) + sal_QC = xr.where(ts.conductivity_QC == 4,4,1) + ts['salinity_QC'] = sal_QC + ts['temperature_QC'] =(('time'), np.ones_like(ts.temperature) ) + + #added to C-PROOF files for gridding function + vars_ = ['conductivity_QC','salinity_QC','temperature_QC'] + for var in vars_: + ts[var].attrs["average_method"] = "QC_protocol" + + ############## if there is a time lag + if dTdC is not None : + dt = np.timedelta64(dTdC, 's') + ts['temperature_adjusted'] = ts.temperature.interp(time=ts.time + dt) + ts['temperature_adjusted'].attrs['comment'] = 'temperature [degC]; CT lag correction applied' + ts['salinity_adjusted'] = (('time'), + gsw.conversions.SP_from_C(10*ts['conductivity'], ts['temperature_adjusted'], ts.pressure).values) + ts['salinity_adjusted'].attrs['comment'] = 'CT lag correction applied' + + if tau is not None: + dt = np.diff(ts.time.values).astype("timedelta64[s]").astype(int) + vals, counts = np.unique(dt, return_counts=True) + srate = vals[np.argmax(counts)] + fs = 1 / float(srate) + fn = 0.5 * fs + s =correct_sal(ts, fn, alpha=alpha, tau=tau, interpolate_filter= CPROOF_sal_interpolate_filter ) + ts['salinity_adjusted'] = xr.where(ts.salinity_QC == 1, s, ts.salinity) + ts['salinity_adjusted'].attrs['comment'] = f'adjusted salinity [psu] using a thermal lag correction with alpha = {alpha} and tau = {tau} ' + ts['temperature_adjusted'] = ts.temperature.copy() + ts['temperature_adjusted'].attrs['comment'] = 'temperature [degC]' + ts.attrs['correction_constants'] = f'alpha = {alpha}; tau={tau}' + + ############# Update metadata + ts.attrs['processing_details'] = 'Processing details are located on the C-PROOF website for this mission under the reports tab.' + ts.attrs['processing_tech'] = 'Lauryn Talbot; ltalbot@uvic.ca' + ts.attrs['citation'] = '"Klymak, J., & Ross, T. (2025). C-PROOF Underwater Glider Deployment Datasets [Data set]. Canadian-Pacific Robotic Ocean Observing Facility.doi:10.82534/44DS-K310"' + ts.attrs['references'] = 'https://doi.org/10.82534/44DS-K310' + ts.attrs['quality_flags']=['1 = good data; 3 = bad data, potentially correctable; 4 = bad data; 8 = estimated data'] + + # Uncorrected conductivity + ts['conductivity'].attrs['comment'] = 'uncorrected conductivity' + ts['conductivity_QC'] = ts.conductivity_QC + + # Uncorrected temperature + ts['temperature'].attrs['comment'] = 'uncorrected temperature [degC]' + ts['temperature_QC'] = ts.temperature_QC + + # Adjusted temperature + ts['temperature_adjusted_QC'] = ts['temperature_QC'] + + # Uncorrected salinity + ts['salinity'].attrs['comment'] = 'uncorrected salinity [psu]' + + # Corrected salinity + ts['salinity_adjusted_QC'] = ts['salinity_QC'] + + # Unadjusted density + ts['density'].attrs['comment'] = 'unadjusted density' + ts['density_QC'] = ts['salinity_QC'] #same as salinity since derived + + ########## Recalculate derived variables + + long = ts.longitude.fillna(ts.longitude.mean(skipna=True)) + lat = ts.latitude.fillna(ts.latitude.mean(skipna=True)) + sa_adj = gsw.SA_from_SP(ts['salinity_adjusted'],ts['pressure'],long,lat) + ct_adj = gsw.CT_from_t(sa_adj,ts['temperature_adjusted'],ts['pressure']) + ts['potential_density_adjusted'] = (('time'),1000 + gsw.density.sigma0(sa_adj,ct_adj).values) + ts['potential_density_adjusted'].attrs['comment'] = 'calculated using adjusted salinity' + ts['potential_density_adjusted_QC'] = ts['salinity_adjusted_QC'] + + ts['potential_temperature_adjusted'] = (('time'), + gsw.conversions.pt0_from_t(ts.salinity_adjusted,ts.temperature_adjusted,ts.pressure).values) + ts['potential_temperature_adjusted'].attrs['comment'] = 'calculated using adjusted salinity' + ts['potential_temperature_adjusted_QC'] = ts['salinity_adjusted_QC'] + + processing_date = date.today().strftime('%Y%m%d') + vars_ = ['salinity_adjusted','temperature_adjusted','potential_density_adjusted','potential_temperature_adjusted'] + for var in vars_: + ts[var].attrs['processing_date'] = processing_date + + vars_ = ['conductivity_QC','salinity_QC','temperature_QC','salinity_adjusted_QC', + 'temperature_adjusted_QC','potential_density_adjusted_QC','potential_temperature_adjusted'] + for var in vars_: + ts[var].attrs["comment"] = ['1 = good data; 3 = bad data, potentially correctable; 4 = bad data; 8 = estimated data'] + + ########### Make gridfile + deploy_name = ts.deployment_name + ts.to_netcdf(f'{ts_directory}/{deploy_name}_CTDadjusted.nc') + make_gridfiles(f'{ts_directory}/{deploy_name}_CTDadjusted.nc', f'{ds_directory}', deploymentyaml, fnamesuffix='_CTDadjusted', + maskfunction=CPROOF_mask,interp_variables=interpolate_vertical) + + return 'Files are saved' + + + +# aliases +extract_L0timeseries_profiles = extract_timeseries_profiles +make_L0_gridfiles = make_gridfiles + + +__all__ = ['extract_timeseries_profiles', 'make_gridfiles'] From f52ddb589ea56c2fb753face9f0200a092381526 Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 09:34:23 -0700 Subject: [PATCH 3/8] Added mask and interpolation parameters to CTD function to make it more general --- pyglider/ncprocess.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index fd8f0eb..d7858be 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -331,7 +331,7 @@ def make_gridfiles(inname, good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] if len(good) <= 0: continue - if 'QC_protocol' in ds[k].attrs.values(): + if d in ds[k].attrs.values(): # QC variables are treated as discrete flags rather than continuous data. # If a variable has a QC_protocol attribute, it is gridded using the # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). @@ -622,7 +622,16 @@ def correct_sal(ds, fn, alpha, tau, interpolate_filter = None): return sal -def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha= None, tau = None, dTdC=None, accuracy = None): +def adjust_CTD(uncorrected_file, + deploymentyaml, + ts_directory, + ds_directory, + alpha= None, + tau = None, + dTdC=None, + accuracy = None, + maskfunction = None, + interp_variables = None): ts = xr.open_dataset(uncorrected_file) deployment = utils._get_deployment(deploymentyaml) @@ -778,7 +787,7 @@ def adjust_CTD(uncorrected_file,deploymentyaml,ts_directory, ds_directory, alpha deploy_name = ts.deployment_name ts.to_netcdf(f'{ts_directory}/{deploy_name}_CTDadjusted.nc') make_gridfiles(f'{ts_directory}/{deploy_name}_CTDadjusted.nc', f'{ds_directory}', deploymentyaml, fnamesuffix='_CTDadjusted', - maskfunction=CPROOF_mask,interp_variables=interpolate_vertical) + maskfunction= maskfunction,interp_variables= interp_variables) return From 1a5a89beb217f491facca0e3ebce1fc7bc582463 Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 09:36:43 -0700 Subject: [PATCH 4/8] Added mask and interpolation parameters to CTD function to make it more general --- pyglider/ncprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index d7858be..319b365 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -331,7 +331,7 @@ def make_gridfiles(inname, good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] if len(good) <= 0: continue - if d in ds[k].attrs.values(): + if 'QC_protocol' in ds[k].attrs.values(): # QC variables are treated as discrete flags rather than continuous data. # If a variable has a QC_protocol attribute, it is gridded using the # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). From 1ccd3028b50ebce8135c40c5454ede3af26aa738 Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 10:13:33 -0700 Subject: [PATCH 5/8] Add adjust_CTD docs and ignore Jupyter checkpoints --- .gitignore | 1 + .ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 - docs/adjust_CTD.md | 118 +++++++++++++++++++ 3 files changed, 119 insertions(+), 6 deletions(-) delete mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 docs/adjust_CTD.md diff --git a/.gitignore b/.gitignore index 195440f..298ce16 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ dist *.png .DS_Store +*.ipynb_checkpoints diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 363fcab..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/adjust_CTD.md b/docs/adjust_CTD.md new file mode 100644 index 0000000..d944de9 --- /dev/null +++ b/docs/adjust_CTD.md @@ -0,0 +1,118 @@ +# PyGlider: Adjust CTD variables + +PyGlider applies a post-processing protocol to conductivity, temperature, and salinity variables in NetCDF timeseries files, and subsequently generates NetCDF depth–time grids using Python and `xarray`. The resulting NetCDF files are largely CF-compliant. + +The basic workflow consists of converting a NetCDF timeseries into an adjusted timeseries and corresponding depth–time grids. This follows the `binary_to_timeseries` (for Slocum gliders) and `raw_to_timeseries` (for Alseamar gliders) protocols, which convert raw glider data into NetCDF format. The variable `outname` refers to the timeseries output from this prior step. + +```python +outname_ctd = pyglider.ncprocess.adjust_CTD( + outname, + deploymentyaml, + l1tsdir, + griddir, + dTdC=None, + tau=None, + alpha=None, + maskfunction=None, + interp_variables=None +) +``` + +Data are read from and written to directories, and metadata are supplied via a YAML file. + +--- + +## Post-processing steps within `pyglider.ncprocess.adjust_CTD` + +### 1. Identify anomalous conductivity values + +We identify and flag conductivity values that are clearly unphysical, typically caused by air bubbles in the conductivity cell. + +A two-step statistical criterion is applied: + +- First, data points more than **5 standard deviations** from the mean are temporarily flagged within each depth and profile bin. +- The mean and standard deviation are then recomputed excluding these points. +- Values still exceeding **3 standard deviations** from the recomputed mean are flagged as **bad (QC = 4)** in `conductivity_QC`. + +However, if the deviation is smaller than the sensor accuracy (0.0003 S/m for the GPCTD), the data are retained. + +This procedure is applied using: + +- Profile bins of 50 profiles +- Depth bins of 5 m + +Using profile-index binning (rather than time or temperature) helps isolate unphysical values. + +Salinity (`salinity_QC`) is flagged as bad (QC = 4) wherever `conductivity_QC` is QC4. + +--- + +### 2. Determine what dTdC, tau, and alpha are used in the correction + +We correct for: + +- Sensor misalignment between temperature and conductivity (`dTdC`) +- Thermal lag effects (`tau`, `alpha`) + +Where: + +- `dTdC` = time lag (seconds) between temperature and conductivity sensors +- `tau` = thermal response time constant (seconds) +- `alpha` = scaling of thermal coupling between water and the conductivity cell + +Further details and methods for determining these parameters are available at: +https://cproof.uvic.ca/gliderdata/deployments/reports/ + +For recent C-PROOF missions, these values are included in the YAML file. However, users may: + +- Provide custom values for `dTdC`, `tau`, and `alpha`, or +- Skip corrections by setting parameters to `None` + +If user-supplied values differ from those in the YAML file, a warning is issued, but the user-provided values are used. + +New variables introduced: + +- `temperature_adjusted` +- `salinity_adjusted` +- `temperature_adjusted_QC` +- `salinity_adjusted_QC` + +--- + +### 3. Recalculate derived variables + +Using TEOS-10, we recompute: + +- `potential_density_adjusted` +- `potential_temperature_adjusted` + +Their corresponding QC variables: + +- `potential_density_adjusted_QC` +- `potential_temperature_adjusted_QC` + +These are flagged as bad (QC = 4) wherever `salinity_adjusted_QC` is QC4. + +--- + +### 4. Convert the adjusted NetCDF timeseries to a NetCDF depth-time grid + +The adjusted timeseries is converted into a gridded NetCDF dataset. + +- Default binning: + - 1 m depth bins + - Profile bins + +- Variables are averaged within each bin. + +QC variables use a `QC_protocol` that selects the maximum QC value within each bin, ensuring that bad data are not diluted. + +Optional parameters: + +- `maskfunction` +- `interp_variables` + +C-PROOF applies: + +- `pyglider.ncprocess.CPROOF_mask` to exclude QC4 data from gridded products +- `pyglider.ncprocess.interpolate_vertical` to interpolate over vertical gaps up to 50 m From 9dc7cf6c8aa8d69117c3d2a6bee65e5a14a9dfcc Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 10:51:24 -0700 Subject: [PATCH 6/8] Update docs and add CTD adjustment guidance --- .gitignore | 2 ++ docs/getting-started-seaexplorer.md | 12 +++++++++++- docs/getting-started-slocum.md | 12 +++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 298ce16..d10753f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ dist *.png .DS_Store *.ipynb_checkpoints +Untitled*.ipynb +*.py.save diff --git a/docs/getting-started-seaexplorer.md b/docs/getting-started-seaexplorer.md index 779a33a..ed7eaf9 100644 --- a/docs/getting-started-seaexplorer.md +++ b/docs/getting-started-seaexplorer.md @@ -31,7 +31,17 @@ The example script is relatively straight forward if there is no intermediate pr Data comes from an input directory, and is translated to raw glider-dependent parquet files files and put in a new directory. These files are useful of their own right. Apache Parquet is a columnar oriented format for storing tabular data. Parquet files take up less space than netCDF or csv and are much faster to read and write. These files can be opened with [polars.read_parquet](https://pola-rs.github.io/polars-book/user-guide/howcani/io/parquet.html) or [pandas.read_parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html). These files are then merged into a single monolithic parquet file, and this is translated to a CF-compliant timeseries netCDF file. Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved. -It is likely that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps. +Users may wish to include additional screening steps or calibration adjustments between these stages. + +PyGlider provides an optional function, `pyglider.ncprocess.adjust_CTD`, which: + +- identifies anomalous conductivity values +- corrects for sensor misalignment between temperature and conductivity +- applies a thermal lag correction + +This function can be used as a starting point and adapted depending on the dataset and application. + +See the full documentation: [CTD adjustment workflow](adjust_CTD.md). (ExDepl)= diff --git a/docs/getting-started-slocum.md b/docs/getting-started-slocum.md index fd01d30..42d7c72 100644 --- a/docs/getting-started-slocum.md +++ b/docs/getting-started-slocum.md @@ -29,7 +29,17 @@ Data comes from an input directory, and is translated into a single CF-compliant There is a version that does not require `dbdreader` to do the initial conversion from the Dinkum format to netCDF. However it is quite slow, particularly for full-resolution datasets, and less robust. We suggest using the `slocum.raw_to_timeseries`. ::: -It is possible that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps, but is designed so they are easy to add. +Users may wish to include additional screening steps or calibration adjustments between these stages. + +PyGlider provides an optional function, `pyglider.ncprocess.adjust_CTD`, which: + +- identifies anomalous conductivity values +- corrects for sensor misalignment between temperature and conductivity +- applies a thermal lag correction + +This function can be used as a starting point and adapted depending on the dataset and application. + +See the full documentation: [CTD adjustment workflow](adjust_CTD.md). (ExDeplSlocum)= From 53a221f07fbfb2f6a1fd8e0f3c903d87fd2a61ff Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 11:14:01 -0700 Subject: [PATCH 7/8] Adjusted pip installation --- .circleci/config.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 17da7d6..d9e7df8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,14 +8,16 @@ jobs: executor: python/default steps: - checkout + - run: + name: Upgrade pip and setuptools + command: python -m ensurepip --upgrade || true && python -m pip install --upgrade pip setuptools - run: name: Install Python dependencies command: | - python -m pip install --user \ - -r docs-requirements.txt + python -m pip install -r docs-requirements.txt - run: name: install module - command: python -m pip install --user -ve . + command: python -m pip install -ve . - run: name: Build docs command: cd docs/ && make html From 2d28aa4167932600ca864bb884b7c493106c074c Mon Sep 17 00:00:00 2001 From: Lauryn Talbot Date: Fri, 20 Mar 2026 12:34:00 -0700 Subject: [PATCH 8/8] Add CTD adjustment documentation page and include in toctree --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 3b4fcaa..2f03c80 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,7 +29,7 @@ Install getting-started-seaexplorer getting-started-slocum pyglider/pyglider - +adjust_CTD ``` ## Acknowledgements