From 1ad37029da5913fbd28ed01e890e67de506c74a7 Mon Sep 17 00:00:00 2001
From: cproof <cproof@cproof.arbutus>
Date: Fri, 1 Aug 2025 16:31:01 +0000
Subject: [PATCH 1/4] FIX: allow sea explorer files to have different columns

---
 pyglider/seaexplorer.py | 134 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 131 insertions(+), 3 deletions(-)

diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py
index 6c20c2d..e4afb7c 100644
--- a/pyglider/seaexplorer.py
+++ b/pyglider/seaexplorer.py
@@ -7,6 +7,7 @@
 import glob
 import logging
 import os
+import warnings
 
 import numpy as np
 import polars as pl
@@ -280,7 +281,23 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'):
     if not files:
         _log.warning(f'No *gli*.parquet files found in {indir}')
         return False
-    gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet')
+    # lets figure out what columns to read for situations where the number of columns changes:
+    gli0 = pl.read_parquet(files[0])
+    gli1 = pl.read_parquet(files[-1])
+    columns = list(set(gli0.columns) & set(gli1.columns))
+    columns = set([c for c in columns if len(c) > 0])
+    dfs = []
+    for f in files:
+        df = pl.read_parquet(f)
+        missing = columns - set(df.columns)
+        for col in missing:
+            df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col))
+
+        dfs.append(df)
+    gli = pl.concat(dfs, rechunk=True)
+
+    #gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet', columns=columns,
+    #                      missing_columns='insert')
     gli = drop_pre_1971_samples(gli)
     gli.write_parquet(outgli)
     _log.info(f'Done writing {outgli}')
@@ -290,7 +307,22 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'):
     if not files:
         _log.warning(f'No *{kind}*.parquet files found in {indir}')
         return False
-    pld = pl.read_parquet(indir + '/*.pld1.' + kind + '.*.parquet')
+    gli0 = pl.read_parquet(files[0])
+    gli1 = pl.read_parquet(files[-1])
+    columns = list(set(gli0.columns) | set(gli1.columns))
+    columns = set([c for c in columns if len(c) > 0])
+    dfs = []
+    for f in files:
+        df = pl.read_parquet(f)
+        missing = columns - set(df.columns)
+        for col in missing:
+            df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col))
+
+        dfs.append(df.select(sorted(columns)))
+    pld = pl.concat(dfs, rechunk=True)
+
+    #    pld = pl.read_parquet(indir + '/*.pld1.' + kind + '.*.parquet',
+    #                          missing_columns='insert', columns=columns)
     pld = drop_pre_1971_samples(pld)
     pld.write_parquet(outpld)
 
@@ -338,6 +370,29 @@ def _remove_fill_values(df, fill_value=9999):
     return df
 
 
+def _forward_fill(gli, todo='Lat'):
+    """Set entries of column ``todo`` to NaN when they equal the most recent non-null value from earlier rows (stale repeats)."""
+    gli = gli.with_columns([
+        pl.col(todo).fill_null(strategy="forward").alias("temp_fill")
+    ])
+    gli = gli.with_columns([
+        pl.when(
+            (pl.col(todo) == pl.col("temp_fill").shift(1)) & pl.col(todo).is_not_null()
+        ).then(np.nan).otherwise(pl.col(todo)).alias(todo)
+    ])
+    gli = gli.drop("temp_fill")
+    return gli
+
+
+def _drop_if(gli, todo='Lat', condit='DeadReckoning', value=1):
+    """Set column ``todo`` to NaN wherever column ``condit`` equals ``value`` (defaults: blank Lat if DeadReckoning is 1)."""
+    gli = gli.with_columns([
+        pl.when(pl.col(condit) == value).then(np.nan).otherwise(pl.col(todo)).alias(todo)
+    ])
+    return gli
+
+
+
 def raw_to_timeseries(
     indir,
     outdir,
@@ -348,12 +403,66 @@ def raw_to_timeseries(
     maxgap=10,
     interpolate=False,
     fnamesuffix='',
+    deadreckon=False,
+    replace_attrs=None
 ):
     """
-    A little different than above, for the 4-file version of the data set.
+    Convert raw seaexplorer data to a timeseries netcdf file.
+
+    Parameters
+    ----------
+    indir : str
+        Directory where the raw files are kept.
+
+    outdir : str
+        Directory to write the matching ``*.nc`` files.
+
+    deploymentyaml : str
+        YAML text file with deployment information for this glider.
+
+    kind : 'raw' or 'sub'
+        The type of data to process.  'raw' is the full resolution data, 'sub'
+        is the sub-sampled data.  The default is 'raw'.  Note that realtime data is
+        typically sub-sampled.
+
+    profile_filt_time : float
+        Time in seconds to use for filtering the profiles.  Default is 100.
+
+    profile_min_time : float
+        Minimum time in seconds for a profile to be considered a valid profile.
+        Default is 300.
+
+    maxgap : float
+        Maximum gap in seconds to interpolate over.  Default is 10.
+
+    interpolate : bool
+        If *True*, interpolate the data to fill in gaps.  Default is False.
+
+    fnamesuffix : str
+        Suffix to add to the output file name.  Default is ''.
+
+    deadreckon : bool
+        If *True* use the dead reckoning latitude and longitude data from the glider.  If
+        *False* (the default), latitude and longitude are linearly interpolated between
+        surface fixes, which is recommended to avoid unphysical underwater jumps.
+
+    replace_attrs : dict or None
+        Global attributes to override in the deployment metadata after the
+        deployment file has been read; each key replaces the entry of the same
+        name.  Helpful for processing runs where only a couple of things change.
+
+
+    Returns
+    -------
+    outname : str
+        Name of the output netcdf file.
+
     """
 
     deployment = utils._get_deployment(deploymentyaml)
+    if replace_attrs:
+        for att in replace_attrs:
+            deployment['metadata'][att] = replace_attrs[att]
 
     metadata = deployment['metadata']
     ncvar = deployment['netcdf_variables']
@@ -365,6 +474,25 @@ def raw_to_timeseries(
     sensor = pl.read_parquet(f'{indir}/{id}-{kind}pld.parquet')
     sensor = _remove_fill_values(sensor)
 
+    # don't use lat/lon if deadreckoned:
+    if not deadreckon:
+        if not ncvar['latitude']['source'] == 'Lat':
+            warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.")
+        if not ncvar['longitude']['source'] == 'Lon':
+            warnings.warn("For deadreckon=False, it is suggested to use 'Lon' as the source for longitude.")
+        if 'DeadReckoning' in gli.columns:
+            _log.info('Not using deadreckoning; glider has DeadReckoning column')
+            gli = _drop_if(gli, todo='Lat', condit='DeadReckoning', value=1)
+            gli = _drop_if(gli, todo='Lon', condit='DeadReckoning', value=1)
+        else:
+            _log.info('Not using deadreckoning; glider does not have DeadReckoning column')
+            gli = _drop_if(gli, todo='Lat', condit='NavState', value=116)
+            gli = _drop_if(gli, todo='Lon', condit='NavState', value=116)
+        # drop a lat/lon if it is not unique.  Happens when there
+        # are stale fixes.
+        gli = _forward_fill(gli, todo='Lat')
+        gli = _forward_fill(gli, todo='Lon')
+
     # build a new data set based on info in `deploymentyaml.`
     # We will use ctd as the interpolant
     ds = xr.Dataset()

From 0599cd94dc417371423c1b54f39a7f8f39d2c1e6 Mon Sep 17 00:00:00 2001
From: cproof <cproof@cproof.arbutus>
Date: Fri, 1 Aug 2025 16:33:41 +0000
Subject: [PATCH 2/4] FIX: allow sea explorer files to have different columns

---
 pyglider/seaexplorer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py
index e4afb7c..2aa9073 100644
--- a/pyglider/seaexplorer.py
+++ b/pyglider/seaexplorer.py
@@ -284,7 +284,7 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'):
     # lets figure out what columns to read for situations where the number of columns changes:
     gli0 = pl.read_parquet(files[0])
     gli1 = pl.read_parquet(files[-1])
-    columns = list(set(gli0.columns) & set(gli1.columns))
+    columns = list(set(gli0.columns) | set(gli1.columns))
     columns = set([c for c in columns if len(c) > 0])
     dfs = []
     for f in files:
@@ -293,7 +293,7 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'):
         for col in missing:
             df = df.with_columns(pl.lit(None).cast(pl.Float64).alias(col))
 
-        dfs.append(df)
+        dfs.append(df.select(sorted(columns)))
     gli = pl.concat(dfs, rechunk=True)
 
     #gli = pl.read_parquet(indir + '/*.gli.sub.*.parquet', columns=columns,

From 3f64dd392c4a838922c44b0045f11ad74f5981d4 Mon Sep 17 00:00:00 2001
From: cproof <cproof@cproof.arbutus>
Date: Fri, 20 Mar 2026 17:33:05 +0000
Subject: [PATCH 3/4] version 20 Mar 2026

---
 pyglider/seaexplorer.py | 10 ++++++++++
 pyglider/slocum.py      |  2 ++
 pyglider/utils.py       | 20 ++++++++++----------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py
index 2aa9073..f2ececd 100644
--- a/pyglider/seaexplorer.py
+++ b/pyglider/seaexplorer.py
@@ -402,6 +402,7 @@ def raw_to_timeseries(
     profile_min_time=300,
     maxgap=10,
     interpolate=False,
+    start_time=None,
     fnamesuffix='',
     deadreckon=False,
     replace_attrs=None
@@ -438,6 +439,9 @@ def raw_to_timeseries(
     interpolate : bool
         If *True*, interpolate the data to fill in gaps.  Default is False.
 
+    start_time : str or None
+        Drop data before this time (converted with ``np.datetime64``); sometimes there are bad times.  Default is *None* (keep all data).
+
     fnamesuffix : str
         Suffix to add to the output file name.  Default is ''.
 
@@ -460,6 +464,7 @@ def raw_to_timeseries(
     """
 
     deployment = utils._get_deployment(deploymentyaml)
+    print(deployment)
     if replace_attrs:
         for att in replace_attrs:
             deployment['metadata'][att] = replace_attrs[att]
@@ -475,6 +480,7 @@ def raw_to_timeseries(
     sensor = _remove_fill_values(sensor)
 
     # don't use lat/lon if deadreckoned:
+    print(ncvar)
     if not deadreckon:
         if not ncvar['latitude']['source'] == 'Lat':
             warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.")
@@ -626,6 +632,10 @@ def raw_to_timeseries(
         ds = ds.where(~np.isnan(keeps))
         ds = ds.dropna(dim='time', how='all')
 
+    # drop dates before start_time
+    if start_time is not None:
+        ds = ds.where(ds.time >= np.datetime64(start_time), drop=True)
+
     # some derived variables:
     ds = utils.get_glider_depth(ds)
     ds = utils.get_distance_over_ground(ds)
diff --git a/pyglider/slocum.py b/pyglider/slocum.py
index db811f8..5423036 100644
--- a/pyglider/slocum.py
+++ b/pyglider/slocum.py
@@ -906,8 +906,10 @@ def binary_to_timeseries(
     sensors = [time_base]
 
     baseind = None
+    print(thenames)
     for nn, name in enumerate(thenames):
         sensorname = ncvar[name]['source']
+        print(sensorname, time_base)
         if not sensorname == time_base:
             sensors.append(sensorname)
         else:
diff --git a/pyglider/utils.py b/pyglider/utils.py
index 0e05151..dc439d1 100644
--- a/pyglider/utils.py
+++ b/pyglider/utils.py
@@ -347,10 +347,10 @@ def get_derived_eos_raw(ds):
         [
             ('long_name', 'water salinity'),
             ('standard_name', 'sea_water_practical_salinity'),
-            ('units', '1e-3'),
-            ('comment', 'raw, uncorrected salinity'),
+            ('units', '1'),
+            ('comment', 'raw, uncorrected practical salinity; Units are also known as PSU'),
             ('sources', 'conductivity temperature pressure'),
-            ('method', 'get_derived_eos_raw'),
+            ('method', 'pyglider.utils.get_derived_eos_raw; gsw.conversions.SP_from_C'),
             ('observation_type', 'calulated'),
             ('instrument', 'instrument_ctd'),
             ('valid_max', 40.0),
@@ -372,9 +372,9 @@ def get_derived_eos_raw(ds):
             ('long_name', 'water potential density'),
             ('standard_name', 'sea_water_potential_density'),
             ('units', 'kg m-3'),
-            ('comment', 'raw, uncorrected salinity'),
+            ('comment', 'raw, uncorrected potential density'),
             ('sources', 'salinity temperature pressure'),
-            ('method', 'get_derived_eos_raw'),
+            ('method', 'pyglider.utils.get_derived_eos_raw; gsw.density.sigma0 using gsw.SA_from_SP and gsw.CT_from_t'),
             ('observation_type', 'calulated'),
             ('instrument', 'instrument_ctd'),
             ('accuracy', 0.01),
@@ -394,11 +394,11 @@ def get_derived_eos_raw(ds):
             ('long_name', 'Density'),
             ('standard_name', 'sea_water_density'),
             ('units', 'kg m-3'),
-            ('comment', 'raw, uncorrected salinity'),
+            ('comment', 'raw, uncorrected density'),
             ('observation_type', 'calulated'),
             ('sources', 'salinity temperature pressure'),
             ('instrument', 'instrument_ctd'),
-            ('method', 'get_derived_eos_raw'),
+            ('method', 'pyglider.utils.get_derived_eos_raw; gsw.density.rho using gsw.SA_from_SP and gsw.CT_from_t'),
             ('valid_min', 990.0),
             ('valid_max', 1040.0),
             ('accuracy', 0.01),
@@ -417,10 +417,10 @@ def get_derived_eos_raw(ds):
             ('long_name', 'water potential temperature'),
             ('standard_name', 'sea_water_potential_temperature'),
             ('units', 'Celsius'),
-            ('comment', 'raw, uncorrected salinity'),
+            ('comment', 'raw, uncorrected potential temperature'),
             ('sources', 'salinity temperature pressure'),
             ('observation_type', 'calulated'),
-            ('method', 'get_derived_eos_raw'),
+            ('method', 'pyglider.utils.get_derived_eos_raw; gsw.conversions.pt0_from_t using gsw.SA_from_SP'),
             ('instrument', 'instrument_ctd'),
             ('accuracy', 0.002),
             ('precision', 0.001),
@@ -558,7 +558,7 @@ def fill_metadata(ds, metadata, sensor_data):
     dt = ds.time.values
     ds.attrs['time_coverage_start'] = '%s' % dt[0]
     ds.attrs['time_coverage_end'] = '%s' % dt[-1]
-    
+
     # make sure this is ISO readable....
     ds.attrs['deployment_start'] = str(dt[0])[:19]
     ds.attrs['deployment_end'] = str(dt[-1])[:19]

From bb3707cac9b15041fb048f1f6da80ee802ff6ac8 Mon Sep 17 00:00:00 2001
From: cproof <cproof@cproof.arbutus>
Date: Fri, 20 Mar 2026 17:35:30 +0000
Subject: [PATCH 4/4] version 20 Mar 2026

---
 pyglider/seaexplorer.py | 2 --
 pyglider/slocum.py      | 2 --
 2 files changed, 4 deletions(-)

diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py
index f2ececd..d6d9032 100644
--- a/pyglider/seaexplorer.py
+++ b/pyglider/seaexplorer.py
@@ -464,7 +464,6 @@ def raw_to_timeseries(
     """
 
     deployment = utils._get_deployment(deploymentyaml)
-    print(deployment)
     if replace_attrs:
         for att in replace_attrs:
             deployment['metadata'][att] = replace_attrs[att]
@@ -480,7 +479,6 @@ def raw_to_timeseries(
     sensor = _remove_fill_values(sensor)
 
     # don't use lat/lon if deadreckoned:
-    print(ncvar)
     if not deadreckon:
         if not ncvar['latitude']['source'] == 'Lat':
             warnings.warn("For deadreckon=False, it is suggested to use 'Lat' as the source for latitude.")
diff --git a/pyglider/slocum.py b/pyglider/slocum.py
index 5423036..db811f8 100644
--- a/pyglider/slocum.py
+++ b/pyglider/slocum.py
@@ -906,10 +906,8 @@ def binary_to_timeseries(
     sensors = [time_base]
 
     baseind = None
-    print(thenames)
     for nn, name in enumerate(thenames):
         sensorname = ncvar[name]['source']
-        print(sensorname, time_base)
         if not sensorname == time_base:
             sensors.append(sensorname)
         else: