From 1ad37029da5913fbd28ed01e890e67de506c74a7 Mon Sep 17 00:00:00 2001 From: cproof Date: Fri, 1 Aug 2025 16:31:01 +0000 Subject: [PATCH 1/4] FIX: allow sea explorer files to have different columns --- pyglider/seaexplorer.py | 134 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 3 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 6c20c2d..e4afb7c 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -7,6 +7,7 @@ import glob import logging import os +import warnings import numpy as np import polars as pl @@ -280,7 +281,23 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): if not files: _log.warning(f'No *gli*.parquet files found in {indir}') return False - gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet') + # lets figure out what columns to read for situations where the number of columns changes: + gli0 = pl.read_parquet(files[0]) + gli1 = pl.read_parquet(files[-1]) + columns = list(set(gli0.columns) & set(gli1.columns)) + columns = set([c for c in columns if len(c) > 0]) + dfs = [] + for f in files: + df = pl.read_parquet(f) + missing = columns - set(df.columns) + for col in missing: + df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col)) + + dfs.append(df) + gli = pl.concat(dfs, rechunk=True) + + #gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet', columns=columns, + # missing_columns='insert') gli = drop_pre_1971_samples(gli) gli.write_parquet(outgli) _log.info(f'Done writing {outgli}') @@ -290,7 +307,22 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): if not files: _log.warning(f'No *{kind}*.parquet files found in {indir}') return False - pld = pl.read_parquet(indir + '/*.pld1.' + kind + '.*.parquet') + gli0 = pl.read_parquet(files[0]) + gli1 = pl.read_parquet(files[-1]) + columns = list(set(gli0.columns) | set(gli1.columns)) + columns = set([c for c in columns if len(c) > 0]) + dfs = [] + for f in files: + df = pl.read_parquet(f) + missing = columns - set(df.columns) + for col in missing: + df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col)) + + dfs.append(df.select(sorted(columns))) + pld = pl.concat(dfs, rechunk=True) + + # pld = pl.read_parquet(indir + '/*.pld1.' + kind + '.*.parquet', + # missing_columns='insert', columns=columns) pld = drop_pre_1971_samples(pld) pld.write_parquet(outpld) @@ -338,6 +370,29 @@ def _remove_fill_values(df, fill_value=9999): return df +def _forward_fill(gli, todo='Lat'): + """Forward-fill the specified column (todo) to propagate the last good value at each row.""" + gli = gli.with_columns([ + pl.col(todo).fill_null(strategy="forward").alias("temp_fill") + ]) + gli = gli.with_columns([ + pl.when( + (pl.col(todo) == pl.col("temp_fill").shift(1)) & pl.col(todo).is_not_null() + ).then(np.nan).otherwise(pl.col(todo)).alias(todo) + ]) + gli = gli.drop("temp_fill") + return gli + + +def _drop_if(gli, todo='Lat', condit='DeadReckoning', value=1): + """Drop Lat if DeadReckoning is 1""" + gli = gli.with_columns([ + pl.when(pl.col(condit) == value).then(np.nan).otherwise(pl.col(todo)).alias(todo) + ]) + return gli + + + def raw_to_timeseries( indir, outdir, @@ -348,12 +403,66 @@ def raw_to_timeseries( maxgap=10, interpolate=False, fnamesuffix='', + deadreckon=False, + replace_attrs=None ): """ - A little different than above, for the 4-file version of the data set. + Convert raw seaexplorer data to a timeseries netcdf file. + + Parameters + ---------- + indir : str + Directory with the raw files are kept. + + outdir : str + Directory to write the matching ``*.nc`` files. + + deploymentyaml : str + YAML text file with deployment information for this glider. + + kind : 'raw' or 'sub' + The type of data to process. 'raw' is the full resolution data, 'sub' + is the sub-sampled data. The default is 'raw'. Note that realtime data is + typically sub-sampled. + + profile_filt_time : float + Time in seconds to use for filtering the profiles. Default is 100. + + profile_min_time : float + Minimum time in seconds for a profile to be considered a valid profile. + Default is 300. + + maxgap : float + Maximum gap in seconds to interpolate over. Default is 10. + + interpolate : bool + If *True*, interpolate the data to fill in gaps. Default is False. + + fnamesuffix : str + Suffix to add to the output file name. Default is ''. + + deadreckon : bool + If *True* use the dead reckoning latitude and longitude data from the glider. Default + is *False*, and latitude and longitude are linearly interpolated between surface fixes. + *False* is the default, and recommended to avoid a-physical underwater jumps. + + replace_attrs : dict or None + replace global attributes in the metadata after reading the metadata + file in. Helpful when processing runs with only a couple things that + change. + + + Returns + ------- + outname : str + Name of the output netcdf file. + """ deployment = utils._get_deployment(deploymentyaml) + if replace_attrs: + for att in replace_attrs: + deployment['metadata'][att] = replace_attrs[att] metadata = deployment['metadata'] ncvar = deployment['netcdf_variables'] @@ -365,6 +474,25 @@ def raw_to_timeseries( sensor = pl.read_parquet(f'{indir}/{id}-{kind}pld.parquet') sensor = _remove_fill_values(sensor) + # don't use lat/lon if deadreckoned: + if not deadreckon: + if not ncvar['latitude']['source'] == 'Lat': + warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.") + if not ncvar['longitude']['source'] == 'Lon': + warnings.warn("For deadreckon=False, it is suggested to use 'Lon' as the source for longitude.") + if 'DeadReckoning' in gli.columns: + _log.info('Not using deadreckoning; glider has DeadReckoning column') + gli = _drop_if(gli, todo='Lat', condit='DeadReckoning', value=1) + gli = _drop_if(gli, todo='Lon', condit='DeadReckoning', value=1) + else: + _log.info('Not using deadreckoning; glider does not have DeadReckoning column') + gli = _drop_if(gli, todo='Lat', condit='NavState', value=116) + gli = _drop_if(gli, todo='Lon', condit='NavState', value=116) + # drop a lat/lon if it is not unique. Happens when there + # are stale fixes. + gli = _forward_fill(gli, todo='Lat') + gli = _forward_fill(gli, todo='Lon') + # build a new data set based on info in `deploymentyaml.` # We will use ctd as the interpolant ds = xr.Dataset() From 0599cd94dc417371423c1b54f39a7f8f39d2c1e6 Mon Sep 17 00:00:00 2001 From: cproof Date: Fri, 1 Aug 2025 16:33:41 +0000 Subject: [PATCH 2/4] FIX: allow sea explorer files to have different columns --- pyglider/seaexplorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index e4afb7c..2aa9073 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -284,7 +284,7 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): # lets figure out what columns to read for situations where the number of columns changes: gli0 = pl.read_parquet(files[0]) gli1 = pl.read_parquet(files[-1]) - columns = list(set(gli0.columns) & set(gli1.columns)) + columns = list(set(gli0.columns) | set(gli1.columns)) columns = set([c for c in columns if len(c) > 0]) dfs = [] for f in files: @@ -293,7 +293,7 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): for col in missing: df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col)) - dfs.append(df) + dfs.append(df.select(sorted(columns))) gli = pl.concat(dfs, rechunk=True) #gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet', columns=columns, From 3f64dd392c4a838922c44b0045f11ad74f5981d4 Mon Sep 17 00:00:00 2001 From: cproof Date: Fri, 20 Mar 2026 17:33:05 +0000 Subject: [PATCH 3/4] version 20 Mar 2026 --- pyglider/seaexplorer.py | 10 ++++++++++ pyglider/slocum.py | 2 ++ pyglider/utils.py | 20 ++++++++++---------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 2aa9073..f2ececd 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -402,6 +402,7 @@ def raw_to_timeseries( profile_min_time=300, maxgap=10, interpolate=False, + start_time=None, fnamesuffix='', deadreckon=False, replace_attrs=None @@ -438,6 +439,9 @@ def raw_to_timeseries( interpolate : bool If *True*, interpolate the data to fill in gaps. Default is False. + start_time : str or None + Drop data if before this date - sometimes there are bad times. Default is *None* + fnamesuffix : str Suffix to add to the output file name. Default is ''. @@ -460,6 +464,7 @@ def raw_to_timeseries( """ deployment = utils._get_deployment(deploymentyaml) + print(deployment) if replace_attrs: for att in replace_attrs: deployment['metadata'][att] = replace_attrs[att] @@ -475,6 +480,7 @@ def raw_to_timeseries( sensor = _remove_fill_values(sensor) # don't use lat/lon if deadreckoned: + print(ncvar) if not deadreckon: if not ncvar['latitude']['source'] == 'Lat': warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.") @@ -626,6 +632,10 @@ def raw_to_timeseries( ds = ds.where(~np.isnan(keeps)) ds = ds.dropna(dim='time', how='all') + # drop dates before start_time + if start_time is not None: + ds = ds.where(ds.time >= np.datetime64(start_time), drop=True) + # some derived variables: ds = utils.get_glider_depth(ds) ds = utils.get_distance_over_ground(ds) diff --git a/pyglider/slocum.py b/pyglider/slocum.py index db811f8..5423036 100644 --- a/pyglider/slocum.py +++ b/pyglider/slocum.py @@ -906,8 +906,10 @@ def binary_to_timeseries( sensors = [time_base] baseind = None + print(thenames) for nn, name in enumerate(thenames): sensorname = ncvar[name]['source'] + print(sensorname, time_base) if not sensorname == time_base: sensors.append(sensorname) else: diff --git a/pyglider/utils.py b/pyglider/utils.py index 0e05151..dc439d1 100644 --- a/pyglider/utils.py +++ b/pyglider/utils.py @@ -347,10 +347,10 @@ def get_derived_eos_raw(ds): [ ('long_name', 'water salinity'), ('standard_name', 'sea_water_practical_salinity'), - ('units', '1e-3'), - ('comment', 'raw, uncorrected salinity'), + ('units', '1'), + ('comment', 'raw, uncorrected practical salinity; Units are also known as PSU'), ('sources', 'conductivity temperature pressure'), - ('method', 'get_derived_eos_raw'), + ('method', 'pyglider.utils.get_derived_eos_raw; gsw.conversions.SP_from_C'), ('observation_type', 'calulated'), ('instrument', 'instrument_ctd'), ('valid_max', 40.0), @@ -372,9 +372,9 @@ def get_derived_eos_raw(ds): ('long_name', 'water potential density'), ('standard_name', 'sea_water_potential_density'), ('units', 'kg m-3'), - ('comment', 'raw, uncorrected salinity'), + ('comment', 'raw, uncorrected potential density'), ('sources', 'salinity temperature pressure'), - ('method', 'get_derived_eos_raw'), + ('method', 'pyglider.utils.get_derived_eos_raw; gsw.density.sigma0 using gsw.SA_from_SP and gsw.CT_from_t'), ('observation_type', 'calulated'), ('instrument', 'instrument_ctd'), ('accuracy', 0.01), @@ -394,11 +394,11 @@ def get_derived_eos_raw(ds): ('long_name', 'Density'), ('standard_name', 'sea_water_density'), ('units', 'kg m-3'), - ('comment', 'raw, uncorrected salinity'), + ('comment', 'raw, uncorrected density'), ('observation_type', 'calulated'), ('sources', 'salinity temperature pressure'), ('instrument', 'instrument_ctd'), - ('method', 'get_derived_eos_raw'), + ('method', 'pyglider.utils.get_derived_eos_raw; gsw.density.rho using gsw.SA_from_SP and gsw.CT_from_t'), ('valid_min', 990.0), ('valid_max', 1040.0), ('accuracy', 0.01), @@ -417,10 +417,10 @@ def get_derived_eos_raw(ds): ('long_name', 'water potential temperature'), ('standard_name', 'sea_water_potential_temperature'), ('units', 'Celsius'), - ('comment', 'raw, uncorrected salinity'), + ('comment', 'raw, uncorrected potential temperature'), ('sources', 'salinity temperature pressure'), ('observation_type', 'calulated'), - ('method', 'get_derived_eos_raw'), + ('method', 'pyglider.utils.get_derived_eos_raw; gsw.conversions.pt0_from_t using gsw.SA_from_SP'), ('instrument', 'instrument_ctd'), ('accuracy', 0.002), ('precision', 0.001), @@ -558,7 +558,7 @@ def fill_metadata(ds, metadata, sensor_data): dt = ds.time.values ds.attrs['time_coverage_start'] = '%s' % dt[0] ds.attrs['time_coverage_end'] = '%s' % dt[-1] - + # make sure this is ISO readable.... ds.attrs['deployment_start'] = str(dt[0])[:19] ds.attrs['deployment_end'] = str(dt[-1])[:19] From bb3707cac9b15041fb048f1f6da80ee802ff6ac8 Mon Sep 17 00:00:00 2001 From: cproof Date: Fri, 20 Mar 2026 17:35:30 +0000 Subject: [PATCH 4/4] version 20 Mar 2026 --- pyglider/seaexplorer.py | 2 -- pyglider/slocum.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index f2ececd..d6d9032 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -464,7 +464,6 @@ def raw_to_timeseries( """ deployment = utils._get_deployment(deploymentyaml) - print(deployment) if replace_attrs: for att in replace_attrs: deployment['metadata'][att] = replace_attrs[att] @@ -480,7 +479,6 @@ def raw_to_timeseries( sensor = _remove_fill_values(sensor) # don't use lat/lon if deadreckoned: - print(ncvar) if not deadreckon: if not ncvar['latitude']['source'] == 'Lat': warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.") diff --git a/pyglider/slocum.py b/pyglider/slocum.py index 5423036..db811f8 100644 --- a/pyglider/slocum.py +++ b/pyglider/slocum.py @@ -906,10 +906,8 @@ def binary_to_timeseries( sensors = [time_base] baseind = None - print(thenames) for nn, name in enumerate(thenames): sensorname = ncvar[name]['source'] - print(sensorname, time_base) if not sensorname == time_base: sensors.append(sensorname) else: