From 2a2add56f9c5b763a459069bbc057bf2f898babb Mon Sep 17 00:00:00 2001 From: Eleanor Frajka-Williams Date: Sun, 7 Sep 2025 12:37:34 +0200 Subject: [PATCH 1/3] [TEST] Stage 3 tests --- notebooks/demo_stage2.ipynb | 308 +--------------- notebooks/demo_stage3.ipynb | 437 +++++------------------ oceanarray/generate_test_data.py | 51 +-- oceanarray/stage2.py | 158 +++++---- oceanarray/stage3.py | 584 +++++++++++++++++++++++++++++++ tests/test_stage2.py | 243 ++++++------- tests/test_stage3.py | 537 ++++++++++++++++++++++++++++ 7 files changed, 1467 insertions(+), 851 deletions(-) create mode 100644 oceanarray/stage3.py create mode 100644 tests/test_stage3.py diff --git a/notebooks/demo_stage2.ipynb b/notebooks/demo_stage2.ipynb index b593485..0a3a6c4 100644 --- a/notebooks/demo_stage2.ipynb +++ b/notebooks/demo_stage2.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "0ff1f4d0", "metadata": {}, "outputs": [], @@ -41,312 +41,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "518ff131", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "==================================================\n", - "Processing Stage 2 for mooring dsE_1_2018\n", - "==================================================\n", - "Starting Stage 2 processing for mooring: dsE_1_2018\n", - "Deployment time: 2018-08-12T22:44:00.000000000\n", - "Recovery time: 2018-08-26T10:38:00.000000000\n", - "Processing sbe56 serial 6363\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128476 to 111829 records\n", - "Final time range: 2018-08-13T12:00:00.000000000 to 2018-08-26T10:37:59.001600000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6363_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6363_use.nc\n", - "Processing sbe16 serial 2419\n", - "Removing variable: timeS\n", - "Applying clock offset: 259000 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 51670 to 47284 records\n", - "Final time range: 2018-08-15T11:56:41.000000000 to 2018-08-26T10:37:41.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe16/dsE_1_2018_2419_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe16/dsE_1_2018_2419_use.nc\n", - "Processing sbe56 serial 6401\n", - "Applying clock offset: -2400 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128349 to 112069 records\n", - "Final time range: 2018-08-13T11:20:00.000000000 to 2018-08-26T10:37:59.020800000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6401_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6401_use.nc\n", - "Processing sbe56 serial 6402\n", - "Applying clock offset: 950 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - 
"Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128417 to 111734 records\n", - "Final time range: 2018-08-13T12:15:50.000000000 to 2018-08-26T10:37:59.033600000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6402_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6402_use.nc\n", - "Processing sbe56 serial 8482\n", - "Applying clock offset: 2000 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/github/oceanarray/oceanarray/stage2.py:166: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.\n", - " with xr.open_dataset(raw_filepath) as ds:\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 136972 to 116604 records\n", - "Final time range: 2018-08-12T22:44:08.979200000 to 2018-08-26T10:37:59.014400000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_8482_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_8482_use.nc\n", - "Processing sbe56 serial 6365\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128443 to 111829 records\n", - "Final time range: 2018-08-13T12:00:00.000000000 to 2018-08-26T10:37:59.001600000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6365_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6365_use.nc\n", - "Processing sbe56 serial 6409\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128431 to 111829 records\n", - "Final time range: 2018-08-13T12:00:00.000000000 to 2018-08-26T10:37:59.001600000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6409_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6409_use.nc\n", - "Processing sbe56 serial 6397\n", - "Applying clock offset: -2400 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128405 to 112069 records\n", - "Final time range: 2018-08-13T11:20:00.000000000 to 2018-08-26T10:37:59.020800000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6397_use.nc\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6397_use.nc\n", - "Processing sbe56 serial 6366\n", - "Applying clock offset: -2400 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128392 to 112069 records\n", - "Final time range: 2018-08-13T11:20:00.000000000 to 2018-08-26T10:37:59.020800000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6366_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6366_use.nc\n", - "Processing sbe56 serial 6394\n", - "Applying clock offset: -2400 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128487 to 112069 records\n", - "Final time range: 2018-08-13T11:20:00.000000000 to 2018-08-26T10:37:59.020800000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6394_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6394_use.nc\n", - "Processing sbe56 serial 6370\n", - "Applying clock offset: -2400 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128455 to 112069 records\n", - "Final time range: 2018-08-13T11:20:00.000000000 to 2018-08-26T10:37:59.020800000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6370_use.nc\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6370_use.nc\n", - "WARNING: Raw file not found: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe16/dsE_1_2018_2418_raw.nc\n", - "Processing tr1050 serial 13889\n", - "Applying clock offset: 85620 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130255 to 111907 records\n", - "Final time range: 2018-08-13T11:47:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_13889_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_13889_use.nc\n", - "Processing rbrsolo serial 101651\n", - "Applying clock offset: 85970 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130929 to 111872 records\n", - "Final time range: 2018-08-13T11:52:50.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101651_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101651_use.nc\n", - "Processing tr1050 serial 15580\n", - "Applying clock offset: 85170 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130327 to 111952 records\n", - "Final time range: 2018-08-13T11:39:30.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15580_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15580_use.nc\n", - "Processing rbrsolo serial 101647\n", - "Applying clock offset: 85920 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130987 to 111877 records\n", - "Final time range: 2018-08-13T11:52:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101647_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101647_use.nc\n", - "Processing tr1050 serial 13874\n", - "Applying clock offset: 86256 seconds\n", - "Trimming 
start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trimmed from 88987 to 88987 records\n", - "Final time range: 2018-08-13T11:57:36.000000000 to 2018-08-23T19:08:36.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_13874_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_13874_use.nc\n", - "Processing rbrsolo serial 101645\n", - "Applying clock offset: 85860 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 131002 to 111883 records\n", - "Final time range: 2018-08-13T11:51:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101645_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101645_use.nc\n", - "Processing tr1050 serial 15574\n", - "Applying clock offset: 85920 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130363 to 111877 records\n", - "Final time range: 2018-08-13T11:52:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15574_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15574_use.nc\n", - "Processing rbrsolo serial 101646\n", - "Applying clock offset: 85320 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130964 to 111937 records\n", - "Final time range: 2018-08-13T11:42:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101646_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/rbrsolo/dsE_1_2018_101646_use.nc\n", - "Processing tr1050 serial 15577\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n", - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Applying clock offset: 84420 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 130190 to 112027 records\n", - "Final time range: 2018-08-13T11:27:00.000000000 to 2018-08-26T10:38:00.000000000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15577_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/tr1050/dsE_1_2018_15577_use.nc\n", - "Processing microcat serial 7518\n", - "Applying clock offset: -2050 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 124619 to 113473 records\n", - "Final time range: 2018-08-13T07:25:51.008000000 to 2018-08-26T10:37:50.979200000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/microcat/dsE_1_2018_7518_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/microcat/dsE_1_2018_7518_use.nc\n", - "Processing sbe56 serial 6364\n", - "Applying clock offset: -200 seconds\n", - "Trimming start to deployment time: 2018-08-12T22:44:00.000000000\n", - "Trimming end to recovery time: 2018-08-26T10:38:00.000000000\n", - "Trimmed from 128381 to 111849 records\n", - "Final time range: 2018-08-13T11:56:40.000000000 to 2018-08-26T10:37:59.017600000\n", - "Removed existing file: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6364_use.nc\n", - "Successfully wrote: /Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/moor/proc/dsE_1_2018/sbe56/dsE_1_2018_6364_use.nc\n", - "Stage 2 completed: 22/23 instruments successful\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/eddifying/Cloudfree/gitlab-cloudfree/ctd-tools/ctd_tools/writers/netcdf_writer.py:97: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. 
To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", - " chunks.append(max(1, min(ds.dims[d], int(chunk_time))))\n" - ] - } - ], + "outputs": [], "source": [ "from oceanarray.stage2 import Stage2Processor, process_multiple_moorings_stage2\n", "\n", diff --git a/notebooks/demo_stage3.ipynb b/notebooks/demo_stage3.ipynb index 937c266..3f717ca 100644 --- a/notebooks/demo_stage3.ipynb +++ b/notebooks/demo_stage3.ipynb @@ -55,408 +55,157 @@ "metadata": {}, "outputs": [], "source": [ - "# Specify the base directory. raw is a subdirectory from here moor/raw/ and proc is moor/proc\n", - "basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/'\n", - "output_path = basedir + 'moor/proc/'\n", - "\n", - "# Toggle to load the *_raw.nc or the *_use.nc\n", - "file_subscript = '_use'\n", - "print(f\"Using files with {file_subscript}\")\n", - "\n", - "\n", - "\n", - "# Cycle through the yaml and load instrument data into a list of xarray datasets\n", - "# Enrich the netCDF with information from the yaml file\n", - "# Find the mooring's processed directory & read the yaml specification\n", - "name1 = moorlist[0]\n", - "proc_dir = output_path + name1\n", - "moor_yaml = proc_dir + '/' + name1 + '.mooring.yaml'\n", - "with open(moor_yaml, 'r') as f:\n", - " moor_yaml_data = yaml.safe_load(f)\n", - "\n", - "# For each instrument, load the raw netCDF files and add some metadata from the yaml\n", - "datasets = []\n", - "for i in moor_yaml_data['instruments']:\n", - " fname = name1 + '_' + str(i['serial']) + file_subscript + '.nc'\n", - " usefile = proc_dir + '/' + i['instrument'] + '/' + fname\n", - "\n", - " if os.path.exists(usefile):\n", - " print(usefile)\n", - " ds1 = xr.open_dataset(usefile)\n", + "from oceanarray.stage3 import Stage3Processor, process_multiple_moorings_stage3\n", "\n", - " if 'InstrDepth' not in ds1.variables and 'depth' in i:\n", - " ds1['InstrDepth'] = i['depth']\n", - " if 'instrument' not in ds1.variables and 'instrument' in i:\n", - " ds1['instrument'] = i['instrument']\n", - " if 'serial_number' not in ds1.variables and 'serial' in i:\n", - " ds1['serial_number'] = i['serial']\n", - " if 'timeS' in ds1.variables:\n", - " ds1 = ds1.drop_vars('timeS')\n", - " #---------------------------------------------\n", - " # Store the data in a list of datasets\n", - " datasets.append(ds1)\n", - "\n", - "\n" + "# Simple usage\n", + "moorlist = ['dsE_1_2018']\n", + "basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/'\n", + "results = process_multiple_moorings_stage3(moorlist, basedir, file_suffix='_use')" ] }, { "cell_type": "code", "execution_count": null, - "id": "915060e4", + "id": "c60445eb", "metadata": {}, "outputs": [], "source": [ - "# Collect some timing info for each instrument\n", - "intervals_min = []\n", - "start_times = []\n", - "end_times = []\n", - "# For each dataset, write the coverage\n", - "for idx, ds in enumerate(datasets):\n", - " time = ds['time']\n", - " start_time = str(time.values[0])\n", - " end_time = str(time.values[-1])\n", - " time_interval = (time.values[1] - time.values[0]) / np.timedelta64(1, 'm')\n", - " time_interval = np.nanmedian(np.diff(time.values) / np.timedelta64(1, 'm') )\n", - " if time_interval > 1:\n", - " tstr = f\"{time_interval:1.2f} min\"\n", - " else:\n", - " tstr = f\"{time_interval * 60:1.2f} sec\"\n", - " variables = list(ds.data_vars)\n", - " print(f\"Dataset {idx} depth {str(ds['InstrDepth'].values)} [{ds['instrument'].values}:{ds['serial_number'].values}]:\")\n", - " 
print(f\" Start time: {start_time[0:19]}. End time: {end_time[0:19]}. Time interval: {tstr}\")\n", - " print(f\" Coordinates: {list(ds.coords)}. Variables: {variables}\")\n", - "\n", - " #---------------------------------------------\n", - " # Save the interval for later use\n", - " intervals_min.append(time_interval)\n", - " start_times.append(time.values[0])\n", - " end_times.append(time.values[-1])\n", + "# Access individual results\n", + "print(results['dsE_1_2018']) # True or False\n", "\n", - "earliest_start = min(start_times)\n", - "latest_end = max(end_times)\n", - "time_grid = np.arange(earliest_start, latest_end, np.timedelta64(int(np.nanmedian(intervals_min) * 60), 's'))\n", + "# Check if a specific mooring succeeded\n", + "if results['dsE_1_2018']:\n", + " print(\"dsE_1_2018 processed successfully\")\n", + "else:\n", + " print(\"dsE_1_2018 failed to process\")\n", "\n", - "print(f\"Time grid length: {len(time_grid)}\")\n", - "print(f\"First time: {time_grid[0]}\")\n", - "print(f\"Last time: {time_grid[-1]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "e81775c9", - "metadata": {}, - "source": [ - "# Interpolate dataset\n", - "\n", - "Here we interpolate data onto the same time grid to simply checking for clock offsets (in a later step)" + "# Iterate through all results\n", + "for mooring_name, success in results.items():\n", + " status = \"SUCCESS\" if success else \"FAILED\"\n", + " print(f\"{mooring_name}: {status}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "6bd66e8d", + "id": "c23478db", "metadata": {}, "outputs": [], "source": [ - "with xr.set_options(keep_attrs=True):\n", - " datasets_interp = []\n", - " for idx, ds in enumerate(datasets):\n", - " if 'time' not in ds.sizes or ds.sizes['time'] <= 1:\n", - " continue\n", - "\n", - " # interpolate the whole dataset at once\n", - " ds_i = ds.interp(time=time_grid)\n", - "\n", - " # preserve global attrs (Dataset.interp can drop them)\n", - " ds_i.attrs = dict(ds.attrs)\n", + "# For a successful mooring, the combined data is saved as:\n", + "# {proc_dir}/{mooring_name}/{mooring_name}_mooring_use.nc\n", "\n", - " # keep coord attrs too (optional)\n", - " if 'time' in ds.coords and ds.time.attrs:\n", - " ds_i.time.attrs = dict(ds.time.attrs)\n", + "# Load the combined dataset\n", + "import xarray as xr\n", "\n", - " # add any extra coords you want to carry through\n", - " if 'InstrDepth' in ds:\n", - " ds_i = ds_i.assign_coords(InstrDepth=ds['InstrDepth'])\n", - " if 'clock_offset' in ds:\n", - " ds_i = ds_i.assign_coords(clock_offset=ds['clock_offset'])\n", + "mooring_name = 'dsE_1_2018'\n", + "if results[mooring_name]: # Only load if processing succeeded\n", + " combined_file = f\"{basedir}/moor/proc/{mooring_name}/{mooring_name}_mooring_use.nc\"\n", + " combined_ds = xr.open_dataset(combined_file)\n", "\n", - " datasets_interp.append(ds_i)\n" + " print(f\"Combined dataset shape: {dict(combined_ds.dims)}\")\n", + " print(f\"Variables: {list(combined_ds.data_vars)}\")\n", + " print(f\"Instruments: {combined_ds.nominal_depth.values}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "6adbe82a", + "id": "b5b2b20b", "metadata": {}, "outputs": [], "source": [ + "# The combined dataset has structure like:\n", + "# Dimensions: (time: 8640, N_LEVELS: 3)\n", + "# Variables: temperature(time, N_LEVELS), salinity(time, N_LEVELS), etc.\n", "\n", + "# Access temperature data for all instruments\n", + "temp_data = combined_ds.temperature # Shape: (time, N_LEVELS)\n", "\n", - "import xarray as xr\n", 
- "\n", - "def merge_global_attrs(ds_list, *, order=\"last\"):\n", - " \"\"\"\n", - " Union of all global attrs across datasets.\n", - " If the same key appears multiple times, later datasets overwrite earlier ones (order='last').\n", - " Use order='first' to prefer the first occurrence instead.\n", - " \"\"\"\n", - " merged = {}\n", - " if order == \"last\":\n", - " it = ds_list\n", - " else: # 'first'\n", - " it = reversed(ds_list)\n", - " for ds in it:\n", - " if hasattr(ds, \"attrs\") and ds.attrs:\n", - " merged.update(dict(ds.attrs)) # later updates overwrite\n", - " return merged\n", - "\n", - "def merge_var_attrs(varname, ds_list, *, order=\"last\"):\n", - " \"\"\"\n", - " Union of attrs for a given variable across datasets in ds_list.\n", - " Later datasets overwrite earlier ones (order='last').\n", - " \"\"\"\n", - " merged = {}\n", - " if order == \"last\":\n", - " it = ds_list\n", - " else:\n", - " it = reversed(ds_list)\n", - " for ds in it:\n", - " if varname in ds and getattr(ds[varname], \"attrs\", None):\n", - " merged.update(dict(ds[varname].attrs))\n", - " return merged\n", - "\n", - "def copy_coord_attrs(src_ds: xr.Dataset, dst_ds: xr.Dataset, coord_names=(\"time\",)):\n", - " \"\"\"Copy attrs for specific coords from src to dst if present.\"\"\"\n", - " for c in coord_names:\n", - " if c in src_ds.coords and getattr(src_ds[c], \"attrs\", None):\n", - " dst_ds[c].attrs = dict(src_ds[c].attrs)\n", + "# Get temperature for a specific instrument level\n", + "temp_level_0 = combined_ds.temperature[:, 0] # First instrument\n", + "temp_level_1 = combined_ds.temperature[:, 1] # Second instrument\n", "\n", + "# Access metadata for each level\n", + "depths = combined_ds.nominal_depth.values\n", + "serials = combined_ds.serial_number.values\n", + "clock_offsets = combined_ds.clock_offset.values\n", "\n", - "#combined_ds.attrs = common_attrs(datasets_clean)\n", - "# Optionally carry time attrs\n", - "#if 'time' in datasets_interp[0].coords and datasets_interp[0].time.attrs:\n", - "# combined_ds['time'].attrs = dict(datasets_interp[0].time.attrs)\n", - "#combined_ds.attrs =\n" - ] - }, - { - "cell_type": "markdown", - "id": "00b34a46", - "metadata": {}, - "source": [ - "# Combine interpolated datasets" + "print(f\"Instrument depths: {depths}\")\n", + "print(f\"Serial numbers: {serials}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "428070e4", + "id": "db454428", "metadata": {}, "outputs": [], "source": [ - "# List of variables to keep\n", - "vars_to_keep = ['temperature', 'salinity', 'conductivity', 'pressure', 'u_velocity','v_velocity']#,'serial_number','InstrDepth']\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "\n", - "# Remove unwanted variables from each dataset\n", - "datasets_clean = []\n", - "for ds in datasets_interp:\n", - " ds_sel = ds.drop_vars(['density', 'potential_temperature', 'julian_days_offset','timeS'], errors='ignore')\n", - " datasets_clean.append(ds_sel)\n", + "def plot_mooring_temperature(combined_ds, mooring_name):\n", + " \"\"\"Plot temperature records from all instruments on a mooring.\"\"\"\n", "\n", - "# Find union of all time coordinates (should be the same for all, but let's check)\n", - "time_coord = datasets_interp[0]['time']\n", + " # Extract metadata for legend labels\n", + " instruments = combined_ds.instrument_id.values if 'instrument_id' in combined_ds else None\n", + " serials = combined_ds.serial_number.values\n", + " depths = combined_ds.nominal_depth.values\n", "\n", - "# Prepare data arrays for 
each variable\n", - "combined_data = {}\n", - "N_LEVELS = len(datasets_clean)\n", + " # Get instrument names from global attributes if available\n", + " if 'instrument_names' in combined_ds.attrs:\n", + " instrument_names = combined_ds.attrs['instrument_names'].split(', ')\n", + " else:\n", + " instrument_names = ['unknown'] * len(serials)\n", "\n", - "def first_ds_with(var):\n", - " for d in datasets_clean:\n", - " if var in d:\n", - " return d[var]\n", - " return None\n", - "def stacked_or_nan(var):\n", - " arrs = []\n", - " for ds in datasets_clean:\n", - " if var in ds:\n", - " arrs.append(ds[var].values)\n", - " else:\n", - " arrs.append(np.full(time_coord.shape, np.nan, dtype=float))\n", - " return np.stack(arrs, axis=-1) # (time, N_LEVELS)\n", + " # Create the plot\n", + " fig, ax = plt.subplots(figsize=(12, 6))\n", "\n", - "for var in vars_to_keep:\n", - " stacked = stacked_or_nan(var)\n", - " var_attrs = merge_var_attrs(var, datasets_clean, order=\"last\")\n", - " combined_data[var] = xr.DataArray(\n", - " stacked,\n", - " dims=('time','N_LEVELS'),\n", - " coords={'time': time_coord, 'N_LEVELS': np.arange(N_LEVELS)},\n", - " attrs=var_attrs\n", - " )\n", + " # Plot each instrument's temperature record\n", + " for i in range(combined_ds.dims['N_LEVELS']):\n", + " temp_data = combined_ds.temperature[:, i]\n", "\n", + " # Skip if all NaN\n", + " if np.all(np.isnan(temp_data)):\n", + " continue\n", "\n", - "# Per-level metadata coords\n", - "depths, clock_offsets, serial, instrtype = [], [], [], []\n", - "for ds in datasets_clean:\n", - " depths.append(float(ds['InstrDepth'].item()) if 'InstrDepth' in ds else np.nan)\n", - " serial.append(ds['serial_number'].item() if 'serial_number' in ds else np.nan)\n", - " instrtype.append(ds['instrument'].item() if 'instrument' in ds else 'unknown')\n", - " if 'clock_offset' in ds:\n", - " co = ds['clock_offset'].item()\n", - " elif 'seconds_offset' in ds:\n", - " co = ds['seconds_offset'].item()\n", - " else:\n", - " co = 0\n", - " clock_offsets.append(int(np.rint(co)) if np.isfinite(co) else 0)\n", + " # Create legend label\n", + " if i < len(instrument_names):\n", + " instrument_type = instrument_names[i]\n", + " else:\n", + " instrument_type = 'unknown'\n", "\n", - "combined_ds = xr.Dataset(\n", - " data_vars=combined_data,\n", - " coords={\n", - " 'time': time_coord,\n", - " 'N_LEVELS': np.arange(N_LEVELS),\n", - " 'clock_offset': ('N_LEVELS', np.asarray(clock_offsets)),\n", - " 'serial_number': ('N_LEVELS', np.asarray(serial)),\n", - " 'nominal_depth': ('N_LEVELS', np.asarray(depths)),\n", - " 'instrument': ('N_LEVELS', np.asarray(instrtype)),\n", - " }\n", - ")\n", + " label = f\"{instrument_type}:{int(serials[i])} ({depths[i]:.0f}m)\"\n", "\n", - "# Apply merged GLOBAL attrs (union across all inputs, last dataset wins on conflicts)\n", - "combined_ds.attrs = merge_global_attrs(datasets_clean, order=\"last\")\n", + " # Plot the data\n", + " ax.plot(combined_ds.time, temp_data, label=label, linewidth=0.8)\n", "\n", - "# Optionally also copy coordinate attrs (e.g., 'time' units, calendar)\n", - "copy_coord_attrs(datasets_interp[0], combined_ds, coord_names=(\"time\",))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6cdd3eb", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import xarray as xr\n", - "\n", - "def encode_instrument_as_flags(ds: xr.Dataset,\n", - " var_name: str = \"instrument\",\n", - " out_name: str = \"instrument_id\") -> xr.Dataset:\n", - " if var_name not in ds:\n", 
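(The `encode_instrument_as_flags` helper removed in this hunk turned the per-level instrument names into CF-style integer flags before writing. Its essence, condensed into a self-contained sketch — names here are illustrative, not the library's API:)

```python
import numpy as np
import xarray as xr

def names_to_flags(names):
    """Encode strings as 1-based integer codes with CF flag metadata (sketch)."""
    uniq = list(dict.fromkeys(names))  # unique names, first-seen order
    codes = {name: i + 1 for i, name in enumerate(uniq)}
    return xr.DataArray(
        np.array([codes[n] for n in names], dtype=np.int16),
        dims=("N_LEVELS",),
        attrs={
            "long_name": "Instrument identifier (encoded)",
            "flag_values": np.arange(1, len(uniq) + 1, dtype=np.int16),
            "flag_meanings": " ".join(s.replace(" ", "_") for s in uniq),
        },
    )

# names_to_flags(["microcat", "sbe56", "microcat"]).values -> array([1, 2, 1], dtype=int16)
```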
- " return ds\n", + " # Formatting\n", + " ax.set_xlabel('Time')\n", + " ax.set_ylabel('Temperature (°C)')\n", + " ax.set_title(f'{mooring_name} - Temperature Records')\n", + " ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " ax.grid(True, alpha=0.3)\n", "\n", - " names = [str(v) for v in np.asarray(ds[var_name].values)]\n", - " uniq = []\n", - " for n in names:\n", - " if n not in uniq:\n", - " uniq.append(n)\n", - " codes = {name: i+1 for i, name in enumerate(uniq)} # start at 1\n", - " id_arr = np.array([codes[n] for n in names], dtype=np.int16)\n", + " # Rotate x-axis labels for better readability\n", + " plt.xticks(rotation=45)\n", + " plt.tight_layout()\n", "\n", - " ds = ds.copy()\n", - " ds[out_name] = ((\"N_LEVELS\",), id_arr)\n", - " # CF style metadata\n", - " ds[out_name].attrs.update({\n", - " \"standard_name\": \"instrument_id\",\n", - " \"long_name\": \"Instrument identifier (encoded)\",\n", - " \"flag_values\": np.array(list(range(1, len(uniq)+1)), dtype=np.int16),\n", - " \"flag_meanings\": \" \".join(s.replace(\" \", \"_\") for s in uniq),\n", - " \"comment\": f\"Mapping: {codes}\"\n", - " })\n", + " return fig, ax\n", "\n", - " # Optional: keep a readable list at global attrs\n", - " ds.attrs[\"instrument_names\"] = \", \".join(uniq)\n", + "# Usage example:\n", + "mooring_name = 'dsE_1_2018'\n", + "if results[mooring_name]:\n", + " combined_file = f\"{basedir}/moor/proc/{mooring_name}/{mooring_name}_mooring_use.nc\"\n", + " combined_ds = xr.open_dataset(combined_file)\n", "\n", - " # Drop the string variable so writers don’t touch it\n", - " ds = ds.drop_vars(var_name)\n", + " fig, ax = plot_mooring_temperature(combined_ds, mooring_name)\n", + " plt.show()\n", "\n", - " return ds\n" + " # Optional: save the plot\n", + " # plt.savefig(f'{mooring_name}_temperature_timeseries.png', dpi=150, bbox_inches='tight')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c94a4baf", - "metadata": {}, - "outputs": [], - "source": [ - "fname = name1 + '_mooring' + file_subscript + '.nc'\n", - "usefile = proc_dir + '/' + fname\n", - "ds_to_save = encode_instrument_as_flags(combined_ds)\n", - "\n", - "print(usefile)\n", - "if 1:\n", - " writer = NetCdfWriter(ds_to_save)\n", - " writer.write(\n", - " usefile,\n", - " optimize=True,\n", - " drop_derived=False, # drops vars with attrs[\"derived\"] == True (e.g., z)\n", - " uint8_vars=[\n", - " \"correlation_magnitude\", \"echo_intensity\", \"status\", \"percent_good\",\n", - " \"bt_correlation\", \"bt_amplitude\", \"bt_percent_good\",\n", - " ],\n", - " float32_vars=[ # optional explicit list; float32=True already covers floats generically\n", - " \"eastward_velocity\", \"northward_velocity\", \"upward_velocity\",\n", - " \"temperature\", \"salinity\", \"pressure\", \"pressure_std\", \"depth\", \"bt_velocity\",\n", - " ],\n", - " chunk_time=3600, # 1-hour chunks if you have ~1 Hz ensembles; adjust as needed\n", - " complevel=5,\n", - " quantize=3,\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b5430af", - "metadata": {}, - "outputs": [], - "source": [ - "combined_ds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44480db6", - "metadata": {}, - "outputs": [], - "source": [ - "plt.pcolormesh(combined_ds.time,combined_ds.N_LEVELS,combined_ds.temperature.transpose(), cmap='RdYlBu_r')\n", - "vmin = np.nanpercentile(combined_ds.temperature.values, 5)\n", - "vmax = np.nanpercentile(combined_ds.temperature.values, 95)\n", - "plt.clim(vmin, vmax)\n", - 
"plt.colorbar(label='Temperature')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b51ed88f", - "metadata": {}, - "outputs": [], - "source": [ - "# Now average the temperature in time using a median average\n", - "plt.plot(combined_ds.time,combined_ds.temperature[:,1])\n", - "plt.plot(combined_ds.time,combined_ds.temperature[:,0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89eba7ea", - "metadata": {}, - "outputs": [], - "source": [ - "combined_ds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07c80f38", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/oceanarray/generate_test_data.py b/oceanarray/generate_test_data.py index 4b9fadf..4be559a 100644 --- a/oceanarray/generate_test_data.py +++ b/oceanarray/generate_test_data.py @@ -2,8 +2,10 @@ """ Generate test_data_raw.nc from the real CNV file for Stage 2 testing. """ -import yaml from pathlib import Path + +import yaml + from oceanarray.stage1 import MooringProcessor @@ -32,33 +34,33 @@ def create_test_stage1_data(): # Create YAML configuration yaml_data = { - 'name': 'test_mooring', - 'waterdepth': 1000, - 'longitude': -30.0, - 'latitude': 60.0, - 'deployment_latitude': '60 00.000 N', - 'deployment_longitude': '030 00.000 W', - 'deployment_time': '2018-08-12T08:00:00', # Before data starts - 'recovery_time': '2018-08-26T20:47:24', # After data ends - 'seabed_latitude': '60 00.000 N', - 'seabed_longitude': '030 00.000 W', - 'directory': 'moor/raw/test_deployment/', - 'instruments': [ + "name": "test_mooring", + "waterdepth": 1000, + "longitude": -30.0, + "latitude": 60.0, + "deployment_latitude": "60 00.000 N", + "deployment_longitude": "030 00.000 W", + "deployment_time": "2018-08-12T08:00:00", # Before data starts + "recovery_time": "2018-08-26T20:47:24", # After data ends + "seabed_latitude": "60 00.000 N", + "seabed_longitude": "030 00.000 W", + "directory": "moor/raw/test_deployment/", + "instruments": [ { - 'instrument': 'microcat', - 'serial': 7518, - 'depth': 100, - 'filename': 'test_data.cnv', - 'file_type': 'sbe-cnv', - 'clock_offset': 300, # 5 minutes offset for testing - 'start_time': '2018-08-12T08:00:00', - 'end_time': '2018-08-26T20:47:24' + "instrument": "microcat", + "serial": 7518, + "depth": 100, + "filename": "test_data.cnv", + "file_type": "sbe-cnv", + "clock_offset": 300, # 5 minutes offset for testing + "start_time": "2018-08-12T08:00:00", + "end_time": "2018-08-26T20:47:24", } - ] + ], } config_file = proc_dir / "test_mooring.mooring.yaml" - with open(config_file, 'w') as f: + with open(config_file, "w") as f: yaml.dump(yaml_data, f) print(f"Created YAML config at {config_file}") @@ -83,8 +85,9 @@ def create_test_stage1_data(): # Cleanup temp directory import shutil + shutil.rmtree(test_dir) - print(f"Cleaned up temporary directory") + print("Cleaned up temporary directory") return True else: diff --git a/oceanarray/stage2.py b/oceanarray/stage2.py index 6596f6e..3a6d49b 100644 --- a/oceanarray/stage2.py +++ b/oceanarray/stage2.py @@ -7,14 +7,15 @@ - Trimming data to deployment/recovery time windows - Writing updated NetCDF files with '_use' suffix """ -import os -import yaml + +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + import numpy as np import pandas as pd import xarray as xr -from pathlib import Path -from datetime import datetime -from typing import Dict, List, Optional, Tuple, Any, Union +import yaml from ctd_tools.writers import 
NetCdfWriter @@ -28,19 +29,19 @@ def __init__(self, base_dir: str): def _setup_logging(self, mooring_name: str, output_path: Path) -> None: """Set up logging for the processing run.""" - log_time = datetime.now().strftime('%Y%m%dT%H') + log_time = datetime.now().strftime("%Y%m%dT%H") self.log_file = output_path / f"{mooring_name}_{log_time}_stage2.mooring.log" def _log_print(self, *args, **kwargs) -> None: """Print to both console and log file.""" print(*args, **kwargs) if self.log_file: - with open(self.log_file, 'a') as f: + with open(self.log_file, "a") as f: print(*args, **kwargs, file=f) def _load_mooring_config(self, config_path: Path) -> Dict[str, Any]: """Load mooring configuration from YAML file.""" - with open(config_path, 'r') as f: + with open(config_path, "r") as f: return yaml.safe_load(f) def _read_yaml_time(self, data: Dict[str, Any], key: str) -> np.datetime64: @@ -53,7 +54,9 @@ def _read_yaml_time(self, data: Dict[str, Any], key: str) -> np.datetime64: except Exception: return np.datetime64("NaT", "ns") - def _apply_clock_offset(self, dataset: xr.Dataset, clock_offset: float) -> xr.Dataset: + def _apply_clock_offset( + self, dataset: xr.Dataset, clock_offset: float + ) -> xr.Dataset: """Apply clock offset correction to dataset time coordinate.""" if clock_offset == 0: return dataset @@ -64,17 +67,20 @@ def _apply_clock_offset(self, dataset: xr.Dataset, clock_offset: float) -> xr.Da result = dataset.copy() # Add clock offset as a variable - result['clock_offset'] = clock_offset - result['clock_offset'].attrs['units'] = 's' + result["clock_offset"] = clock_offset + result["clock_offset"].attrs["units"] = "s" # Apply the correction to time coordinate - result['time'] = result['time'] + np.timedelta64(int(clock_offset), 's') + result["time"] = result["time"] + np.timedelta64(int(clock_offset), "s") return result - def _trim_to_deployment_window(self, dataset: xr.Dataset, - deploy_time: np.datetime64, - recover_time: np.datetime64) -> xr.Dataset: + def _trim_to_deployment_window( + self, + dataset: xr.Dataset, + deploy_time: np.datetime64, + recover_time: np.datetime64, + ) -> xr.Dataset: """Trim dataset to deployment time window.""" original_size = len(dataset.time) @@ -96,26 +102,27 @@ def _trim_to_deployment_window(self, dataset: xr.Dataset, return dataset - def _add_missing_metadata(self, dataset: xr.Dataset, - instrument_config: Dict[str, Any]) -> xr.Dataset: + def _add_missing_metadata( + self, dataset: xr.Dataset, instrument_config: Dict[str, Any] + ) -> xr.Dataset: """Add any missing metadata variables to dataset.""" # Add instrument depth if missing - if 'InstrDepth' not in dataset.variables and 'depth' in instrument_config: - dataset['InstrDepth'] = instrument_config['depth'] + if "InstrDepth" not in dataset.variables and "depth" in instrument_config: + dataset["InstrDepth"] = instrument_config["depth"] # Add instrument type if missing - if 'instrument' not in dataset.variables and 'instrument' in instrument_config: - dataset['instrument'] = instrument_config['instrument'] + if "instrument" not in dataset.variables and "instrument" in instrument_config: + dataset["instrument"] = instrument_config["instrument"] # Add serial number if missing - if 'serial_number' not in dataset.variables and 'serial' in instrument_config: - dataset['serial_number'] = instrument_config['serial'] + if "serial_number" not in dataset.variables and "serial" in instrument_config: + dataset["serial_number"] = instrument_config["serial"] return dataset def _clean_unnecessary_variables(self, 
dataset: xr.Dataset) -> xr.Dataset: """Remove variables that are not needed in the final product.""" - vars_to_remove = ['timeS'] # Add other variables as needed + vars_to_remove = ["timeS"] # Add other variables as needed for var in vars_to_remove: if var in dataset.variables: @@ -127,29 +134,45 @@ def _clean_unnecessary_variables(self, dataset: xr.Dataset) -> xr.Dataset: def _get_netcdf_writer_params(self) -> Dict[str, Any]: """Get standard parameters for NetCDF writer.""" return { - 'optimize': True, - 'drop_derived': False, - 'uint8_vars': [ - "correlation_magnitude", "echo_intensity", "status", "percent_good", - "bt_correlation", "bt_amplitude", "bt_percent_good", + "optimize": True, + "drop_derived": False, + "uint8_vars": [ + "correlation_magnitude", + "echo_intensity", + "status", + "percent_good", + "bt_correlation", + "bt_amplitude", + "bt_percent_good", ], - 'float32_vars': [ - "eastward_velocity", "northward_velocity", "upward_velocity", - "temperature", "salinity", "pressure", "pressure_std", "depth", "bt_velocity", + "float32_vars": [ + "eastward_velocity", + "northward_velocity", + "upward_velocity", + "temperature", + "salinity", + "pressure", + "pressure_std", + "depth", + "bt_velocity", ], - 'chunk_time': 3600, - 'complevel': 5, - 'quantize': 3, + "chunk_time": 3600, + "complevel": 5, + "quantize": 3, } - def _process_instrument(self, instrument_config: Dict[str, Any], - mooring_config: Dict[str, Any], - proc_dir: Path, mooring_name: str, - deploy_time: np.datetime64, - recover_time: np.datetime64) -> bool: + def _process_instrument( + self, + instrument_config: Dict[str, Any], + mooring_config: Dict[str, Any], + proc_dir: Path, + mooring_name: str, + deploy_time: np.datetime64, + recover_time: np.datetime64, + ) -> bool: """Process a single instrument's Stage 1 output.""" - serial = instrument_config.get('serial', 'unknown') - instrument_type = instrument_config.get('instrument', 'unknown') + serial = instrument_config.get("serial", "unknown") + instrument_type = instrument_config.get("instrument", "unknown") # Construct file paths raw_filename = f"{mooring_name}_{serial}_raw.nc" @@ -177,19 +200,23 @@ def _process_instrument(self, instrument_config: Dict[str, Any], dataset = self._clean_unnecessary_variables(dataset) # Apply clock offset - clock_offset = instrument_config.get('clock_offset', 0) + clock_offset = instrument_config.get("clock_offset", 0) dataset = self._apply_clock_offset(dataset, clock_offset) # Trim to deployment window - dataset = self._trim_to_deployment_window(dataset, deploy_time, recover_time) + dataset = self._trim_to_deployment_window( + dataset, deploy_time, recover_time + ) if len(dataset.time) == 0: - self._log_print(f"ERROR: No data remains after processing {instrument_type} {serial}") + self._log_print( + f"ERROR: No data remains after processing {instrument_type} {serial}" + ) return False # Log time range - start_time = dataset['time'].values.min() - end_time = dataset['time'].values.max() + start_time = dataset["time"].values.min() + end_time = dataset["time"].values.max() self._log_print(f"Final time range: {start_time} to {end_time}") # Remove existing output file if it exists @@ -209,8 +236,9 @@ def _process_instrument(self, instrument_config: Dict[str, Any], self._log_print(f"ERROR processing {instrument_type} {serial}: {e}") return False - def process_mooring(self, mooring_name: str, - output_path: Optional[str] = None) -> bool: + def process_mooring( + self, mooring_name: str, output_path: Optional[str] = None + ) -> bool: """ Process 
Stage 2 for a single mooring. @@ -223,7 +251,7 @@ def process_mooring(self, mooring_name: str, """ # Set up paths if output_path is None: - proc_dir = self.base_dir / 'moor' / 'proc' / mooring_name + proc_dir = self.base_dir / "moor" / "proc" / mooring_name else: proc_dir = Path(output_path) / mooring_name @@ -256,22 +284,29 @@ def process_mooring(self, mooring_name: str, # Process each instrument success_count = 0 - total_count = len(mooring_config.get('instruments', [])) + total_count = len(mooring_config.get("instruments", [])) - for instrument_config in mooring_config.get('instruments', []): + for instrument_config in mooring_config.get("instruments", []): success = self._process_instrument( - instrument_config, mooring_config, proc_dir, - mooring_name, deploy_time, recover_time + instrument_config, + mooring_config, + proc_dir, + mooring_name, + deploy_time, + recover_time, ) if success: success_count += 1 - self._log_print(f"Stage 2 completed: {success_count}/{total_count} instruments successful") + self._log_print( + f"Stage 2 completed: {success_count}/{total_count} instruments successful" + ) return success_count > 0 -def stage2_mooring(mooring_name: str, basedir: str, - output_path: Optional[str] = None) -> bool: +def stage2_mooring( + mooring_name: str, basedir: str, output_path: Optional[str] = None +) -> bool: """ Process Stage 2 for a single mooring (backwards compatibility function). @@ -287,8 +322,9 @@ def stage2_mooring(mooring_name: str, basedir: str, return processor.process_mooring(mooring_name, output_path) -def process_multiple_moorings_stage2(mooring_list: List[str], - basedir: str) -> Dict[str, bool]: +def process_multiple_moorings_stage2( + mooring_list: List[str], basedir: str +) -> Dict[str, bool]: """ Process Stage 2 for multiple moorings. @@ -315,9 +351,9 @@ def process_multiple_moorings_stage2(mooring_list: List[str], # Example usage if __name__ == "__main__": # Your mooring list - moorlist = ['dsE_1_2018'] + moorlist = ["dsE_1_2018"] - basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/' + basedir = "/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/" # Process all moorings results = process_multiple_moorings_stage2(moorlist, basedir) diff --git a/oceanarray/stage3.py b/oceanarray/stage3.py new file mode 100644 index 0000000..e8883cc --- /dev/null +++ b/oceanarray/stage3.py @@ -0,0 +1,584 @@ +""" +Stage 3 processing for mooring data: Combine individual instruments into one xarray dataset. 
+
+This module handles:
+- Loading processed Stage 2 NetCDF files (_use.nc)
+- Interpolating all instruments onto a common time grid
+- Combining instruments into a single dataset with N_LEVELS dimension
+- Encoding instrument metadata as coordinate arrays
+- Writing combined mooring datasets
+
+Version: 1.0
+Last updated: 2025-09-07
+"""
+
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+import yaml
+from ctd_tools.writers import NetCdfWriter
+
+
+class Stage3Processor:
+    """Handles Stage 3 processing: combining instruments into single dataset."""
+
+    def __init__(self, base_dir: str):
+        """Initialize processor with base directory."""
+        self.base_dir = Path(base_dir)
+        self.log_file = None
+
+    def _setup_logging(self, mooring_name: str, output_path: Path) -> None:
+        """Set up logging for the processing run."""
+        log_time = datetime.now().strftime('%Y%m%dT%H')
+        self.log_file = output_path / f"{mooring_name}_{log_time}_stage3.mooring.log"
+
+    def _log_print(self, *args, **kwargs) -> None:
+        """Print to both console and log file."""
+        print(*args, **kwargs)
+        if self.log_file:
+            with open(self.log_file, 'a') as f:
+                print(*args, **kwargs, file=f)
+
+    def _load_mooring_config(self, config_path: Path) -> Dict[str, Any]:
+        """Load mooring configuration from YAML file."""
+        with open(config_path, 'r') as f:
+            return yaml.safe_load(f)
+
+    def _load_instrument_datasets(self, mooring_config: Dict[str, Any],
+                                  proc_dir: Path, file_suffix: str = '_use') -> List[xr.Dataset]:
+        """Load all instrument datasets for a mooring."""
+        datasets = []
+        mooring_name = mooring_config['name']
+        expected_instruments = mooring_config.get('instruments', [])
+        found_instruments = []
+        missing_instruments = []
+
+        for instrument_config in expected_instruments:
+            instrument_type = instrument_config.get('instrument', 'unknown')
+            serial = instrument_config.get('serial', 0)
+
+            # Construct file path
+            filename = f"{mooring_name}_{serial}{file_suffix}.nc"
+            filepath = proc_dir / instrument_type / filename
+
+            if not filepath.exists():
+                self._log_print(f"WARNING: File not found: {filepath}")
+                missing_instruments.append(f"{instrument_type}:{serial}")
+                continue
+
+            try:
+                self._log_print(f"Loading {instrument_type} serial {serial}: {filename}")
+                ds = xr.open_dataset(filepath)
+
+                # Ensure required metadata is present
+                ds = self._ensure_instrument_metadata(ds, instrument_config)
+
+                # Clean unnecessary variables
+                ds = self._clean_dataset_variables(ds)
+
+                datasets.append(ds)
+                found_instruments.append(f"{instrument_type}:{serial}")
+
+            except Exception as e:
+                self._log_print(f"ERROR loading {filepath}: {e}")
+                missing_instruments.append(f"{instrument_type}:{serial}")
+                continue
+
+        # Report on missing instruments
+        if missing_instruments:
+            self._log_print("")
+            self._log_print("WARNING: Missing instruments compared to YAML configuration:")
+            for missing in missing_instruments:
+                self._log_print(f"  - {missing}")
+            self._log_print(f"  Expected {len(expected_instruments)}, found {len(found_instruments)}")
+            self._log_print("")
+        else:
+            self._log_print(f"All {len(expected_instruments)} instruments from YAML found and loaded")
+
+        return datasets
+
+    def _ensure_instrument_metadata(self, dataset: xr.Dataset,
+                                    instrument_config: Dict[str, Any]) -> xr.Dataset:
+        """Ensure all required metadata is present in dataset."""
+        # Add missing metadata from config
+        if 'InstrDepth' not in dataset.variables and 'depth' in instrument_config:
+            dataset['InstrDepth'] = instrument_config['depth']
+
+        if 'instrument' not in dataset.variables and 'instrument' in instrument_config:
+            dataset['instrument'] = instrument_config['instrument']
+
+        if 'serial_number' not in dataset.variables and 'serial' in instrument_config:
+            dataset['serial_number'] = instrument_config['serial']
+
+        return dataset
+
+    def _clean_dataset_variables(self, dataset: xr.Dataset) -> xr.Dataset:
+        """Remove unnecessary variables from dataset."""
+        vars_to_remove = ['timeS', 'density', 'potential_temperature', 'julian_days_offset']
+
+        for var in vars_to_remove:
+            if var in dataset.variables:
+                dataset = dataset.drop_vars(var, errors='ignore')
+
+        return dataset
+
+    def _analyze_timing_info(self, datasets: List[xr.Dataset]) -> Tuple[np.ndarray, np.datetime64, np.datetime64]:
+        """Analyze timing information across all datasets."""
+        intervals_min = []
+        start_times = []
+        end_times = []
+        timing_details = []
+
+        for idx, ds in enumerate(datasets):
+            if 'time' not in ds.sizes or ds.sizes['time'] <= 1:
+                self._log_print(f"WARNING: Dataset {idx} has insufficient time data")
+                continue
+
+            time = ds['time']
+            start_time = time.values[0]
+            end_time = time.values[-1]
+
+            # Calculate time intervals (in minutes)
+            time_diffs = np.diff(time.values) / np.timedelta64(1, 'm')
+            time_interval_median = np.nanmedian(time_diffs)
+            time_interval_min = np.nanmin(time_diffs)
+            time_interval_max = np.nanmax(time_diffs)
+            time_interval_std = np.nanstd(time_diffs)
+
+            # Format interval string
+            if time_interval_median > 1:
+                tstr = f"{time_interval_median:.2f} min"
+                tmin_str = f"{time_interval_min:.2f} min"
+                tmax_str = f"{time_interval_max:.2f} min"
+                tstd_str = f"{time_interval_std:.2f} min"
+            else:
+                tstr = f"{time_interval_median * 60:.2f} sec"
+                tmin_str = f"{time_interval_min * 60:.2f} sec"
+                tmax_str = f"{time_interval_max * 60:.2f} sec"
+                tstd_str = f"{time_interval_std * 60:.2f} sec"
+
+            variables = list(ds.data_vars)
+            depth = ds['InstrDepth'].values if 'InstrDepth' in ds else 'unknown'
+            instrument = ds['instrument'].values if 'instrument' in ds else 'unknown'
+            serial = ds['serial_number'].values if 'serial_number' in ds else 'unknown'
+
+            self._log_print(f"Dataset {idx} depth {depth} [{instrument}:{serial}]:")
+            self._log_print(f"  Start: {str(start_time)[:19]}, End: {str(end_time)[:19]}")
+            self._log_print(f"  Time interval - Median: {tstr}, Range: {tmin_str} to {tmax_str}, Std: {tstd_str}")
+            self._log_print(f"  Variables: {variables}")
+
+            # Store timing details for later analysis
+            timing_details.append({
+                'idx': idx,
+                'instrument': f"{instrument}:{serial}",
+                'depth': depth,
+                'median_interval_min': time_interval_median,
+                'min_interval_min': time_interval_min,
+                'max_interval_min': time_interval_max,
+                'std_interval_min': time_interval_std,
+                'n_points': len(time)
+            })
+
+            intervals_min.append(time_interval_median)
+            start_times.append(start_time)
+            end_times.append(end_time)
+
+        if not start_times:
+            raise ValueError("No valid datasets with time information found")
+
+        earliest_start = min(start_times)
+        latest_end = max(end_times)
+        median_interval = np.nanmedian(intervals_min)
+
+        # Analyze timing consistency and warn about issues
+        self._log_print("")
+        self._log_print("TIMING ANALYSIS:")
+        self._log_print(f"  Overall median interval: {median_interval:.2f} min")
+        self._log_print(f"  Range of median intervals: {np.min(intervals_min):.2f} to {np.max(intervals_min):.2f} min")
+
+        # Check for large differences in
sampling rates + interval_ratio = np.max(intervals_min) / np.min(intervals_min) + if interval_ratio > 2.0: + self._log_print(f" WARNING: Large differences in sampling rates detected!") + self._log_print(f" WARNING: Ratio of slowest to fastest: {interval_ratio:.1f}x") + self._log_print(f" WARNING: This may cause significant interpolation artifacts") + + # Check for irregular sampling within instruments + for detail in timing_details: + if detail['std_interval_min'] > 0.1 * detail['median_interval_min']: # >10% variation + self._log_print(f" WARNING: Irregular sampling in {detail['instrument']}") + self._log_print(f" Std dev ({detail['std_interval_min']:.2f} min) > 10% of median ({detail['median_interval_min']:.2f} min)") + + # Report on interpolation target + self._log_print(f" Common grid will use {median_interval:.2f} min intervals") + for detail in timing_details: + diff_pct = abs(detail['median_interval_min'] - median_interval) / median_interval * 100 + if diff_pct > 10: + status = "SIGNIFICANT CHANGE" + elif diff_pct > 1: + status = "MINOR CHANGE" + else: + status = "MINIMAL CHANGE" + self._log_print(f" {detail['instrument']}: {detail['median_interval_min']:.2f} min -> {median_interval:.2f} min ({diff_pct:.1f}% change) {status}") + + # Create common time grid + time_grid = np.arange( + earliest_start, + latest_end, + np.timedelta64(int(median_interval * 60), 's') + ) + + self._log_print(f" Common time grid: {len(time_grid)} points from {time_grid[0]} to {time_grid[-1]}") + self._log_print(f"") + + return time_grid, earliest_start, latest_end + + def _interpolate_datasets(self, datasets: List[xr.Dataset], + time_grid: np.ndarray) -> List[xr.Dataset]: + """Interpolate all datasets onto common time grid.""" + datasets_interp = [] + + with xr.set_options(keep_attrs=True): + for idx, ds in enumerate(datasets): + if 'time' not in ds.sizes or ds.sizes['time'] <= 1: + self._log_print(f"Skipping dataset {idx}: insufficient time data") + continue + + try: + # Interpolate the whole dataset at once + ds_i = ds.interp(time=time_grid) + + # Preserve global attributes (Dataset.interp can drop them) + ds_i.attrs = dict(ds.attrs) + + # Keep coordinate attributes + if 'time' in ds.coords and ds.time.attrs: + ds_i.time.attrs = dict(ds.time.attrs) + + # Add extra coordinates as needed + for coord_name in ['InstrDepth', 'clock_offset', 'serial_number', 'instrument']: + if coord_name in ds: + ds_i = ds_i.assign_coords({coord_name: ds[coord_name]}) + + datasets_interp.append(ds_i) + self._log_print(f"Successfully interpolated dataset {idx}") + + except Exception as e: + self._log_print(f"ERROR interpolating dataset {idx}: {e}") + continue + + return datasets_interp + + def _merge_global_attrs(self, ds_list: List[xr.Dataset], order: str = "last") -> Dict[str, Any]: + """Union of all global attrs across datasets.""" + merged = {} + if order == "last": + it = ds_list + else: # 'first' + it = reversed(ds_list) + for ds in it: + if hasattr(ds, "attrs") and ds.attrs: + merged.update(dict(ds.attrs)) + return merged + + def _merge_var_attrs(self, varname: str, ds_list: List[xr.Dataset], + order: str = "last") -> Dict[str, Any]: + """Union of attrs for a given variable across datasets.""" + merged = {} + if order == "last": + it = ds_list + else: + it = reversed(ds_list) + for ds in it: + if varname in ds and getattr(ds[varname], "attrs", None): + merged.update(dict(ds[varname].attrs)) + return merged + + def _create_combined_dataset(self, datasets_interp: List[xr.Dataset], + time_grid: np.ndarray, + vars_to_keep: 
List[str] = None) -> xr.Dataset: + """Combine interpolated datasets into single dataset with N_LEVELS dimension.""" + if vars_to_keep is None: + vars_to_keep = ['temperature', 'salinity', 'conductivity', 'pressure', + 'u_velocity', 'v_velocity'] + + if not datasets_interp: + raise ValueError("No interpolated datasets provided") + + time_coord = datasets_interp[0]['time'] + n_levels = len(datasets_interp) + + # Helper functions + def stacked_or_nan(var): + """Stack variable across all datasets, filling with NaN if missing.""" + arrs = [] + for ds in datasets_interp: + if var in ds: + arrs.append(ds[var].values) + else: + arrs.append(np.full(time_coord.shape, np.nan, dtype=float)) + return np.stack(arrs, axis=-1) # (time, N_LEVELS) + + # Create combined data variables + combined_data = {} + for var in vars_to_keep: + # Check if any dataset has this variable + if not any(var in ds for ds in datasets_interp): + self._log_print(f"Variable '{var}' not found in any dataset, skipping") + continue + + stacked = stacked_or_nan(var) + var_attrs = self._merge_var_attrs(var, datasets_interp, order="last") + combined_data[var] = xr.DataArray( + stacked, + dims=('time', 'N_LEVELS'), + coords={'time': time_coord, 'N_LEVELS': np.arange(n_levels)}, + attrs=var_attrs + ) + + # Extract per-level metadata + depths, clock_offsets, serial_nums, instrument_types = [], [], [], [] + for ds in datasets_interp: + depths.append(float(ds['InstrDepth'].item()) if 'InstrDepth' in ds else np.nan) + serial_nums.append(ds['serial_number'].item() if 'serial_number' in ds else np.nan) + instrument_types.append(ds['instrument'].item() if 'instrument' in ds else 'unknown') + + # Handle different clock offset variable names + if 'clock_offset' in ds: + co = ds['clock_offset'].item() + elif 'seconds_offset' in ds: + co = ds['seconds_offset'].item() + else: + co = 0 + clock_offsets.append(int(np.rint(co)) if np.isfinite(co) else 0) + + # Create combined dataset + combined_ds = xr.Dataset( + data_vars=combined_data, + coords={ + 'time': time_coord, + 'N_LEVELS': np.arange(n_levels), + 'clock_offset': ('N_LEVELS', np.asarray(clock_offsets)), + 'serial_number': ('N_LEVELS', np.asarray(serial_nums)), + 'nominal_depth': ('N_LEVELS', np.asarray(depths)), + 'instrument': ('N_LEVELS', np.asarray(instrument_types)), + } + ) + + # Apply merged global attributes + combined_ds.attrs = self._merge_global_attrs(datasets_interp, order="last") + + # Copy coordinate attributes + if 'time' in datasets_interp[0].coords and datasets_interp[0].time.attrs: + combined_ds['time'].attrs = dict(datasets_interp[0].time.attrs) + + return combined_ds + + def _encode_instrument_as_flags(self, ds: xr.Dataset, + var_name: str = "instrument", + out_name: str = "instrument_id") -> xr.Dataset: + """Encode instrument names as integer flags for NetCDF compatibility.""" + if var_name not in ds: + return ds + + names = [str(v) for v in np.asarray(ds[var_name].values)] + uniq = [] + for n in names: + if n not in uniq: + uniq.append(n) + + codes = {name: i+1 for i, name in enumerate(uniq)} # start at 1 + id_arr = np.array([codes[n] for n in names], dtype=np.int16) + + ds = ds.copy() + ds[out_name] = (("N_LEVELS",), id_arr) + + # CF style metadata + ds[out_name].attrs.update({ + "standard_name": "instrument_id", + "long_name": "Instrument identifier (encoded)", + "flag_values": np.array(list(range(1, len(uniq)+1)), dtype=np.int16), + "flag_meanings": " ".join(s.replace(" ", "_") for s in uniq), + "comment": f"Mapping: {codes}" + }) + + # Keep readable list at global attrs 
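+        # Worked example (illustrative): for instruments named
+        # ['microcat', 'adcp', 'microcat'], first-appearance order gives
+        # codes = {'microcat': 1, 'adcp': 2}, so instrument_id = [1, 2, 1],
+        # flag_values = [1, 2], flag_meanings = "microcat adcp", and the
+        # global attribute below becomes "microcat, adcp".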
+ ds.attrs["instrument_names"] = ", ".join(uniq) + + # Drop the string variable + ds = ds.drop_vars(var_name) + + return ds + + def _get_netcdf_writer_params(self) -> Dict[str, Any]: + """Get standard parameters for NetCDF writer.""" + return { + 'optimize': True, + 'drop_derived': False, + 'uint8_vars': [ + "correlation_magnitude", "echo_intensity", "status", "percent_good", + "bt_correlation", "bt_amplitude", "bt_percent_good", + ], + 'float32_vars': [ + "eastward_velocity", "northward_velocity", "upward_velocity", + "temperature", "salinity", "pressure", "pressure_std", "depth", "bt_velocity", + ], + 'chunk_time': 3600, + 'complevel': 5, + 'quantize': 3, + } + + def process_mooring(self, mooring_name: str, + output_path: Optional[str] = None, + file_suffix: str = '_use', + vars_to_keep: List[str] = None) -> bool: + """ + Process Stage 3 for a single mooring: combine instruments into single dataset. + + Args: + mooring_name: Name of the mooring to process + output_path: Optional custom output path + file_suffix: Suffix for input files ('_use' or '_raw') + vars_to_keep: List of variables to include in combined dataset + + Returns: + bool: True if processing completed successfully + """ + # Set up paths + if output_path is None: + proc_dir = self.base_dir / 'moor' / 'proc' / mooring_name + else: + proc_dir = Path(output_path) / mooring_name + + if not proc_dir.exists(): + print(f"ERROR: Processing directory not found: {proc_dir}") + return False + + # Set up logging + self._setup_logging(mooring_name, proc_dir) + self._log_print(f"Starting Stage 3 processing for mooring: {mooring_name}") + self._log_print(f"Using files with suffix: {file_suffix}") + + # Load configuration + config_file = proc_dir / f"{mooring_name}.mooring.yaml" + if not config_file.exists(): + self._log_print(f"ERROR: Configuration file not found: {config_file}") + return False + + try: + mooring_config = self._load_mooring_config(config_file) + except Exception as e: + self._log_print(f"ERROR: Failed to load configuration: {e}") + return False + + # Load instrument datasets + datasets = self._load_instrument_datasets(mooring_config, proc_dir, file_suffix) + + if not datasets: + self._log_print(f"ERROR: No valid datasets found for {mooring_name}") + return False + + self._log_print(f"Loaded {len(datasets)} instrument datasets") + + try: + # Analyze timing and create common grid + time_grid, start_time, end_time = self._analyze_timing_info(datasets) + + # Interpolate datasets onto common grid + datasets_interp = self._interpolate_datasets(datasets, time_grid) + + if not datasets_interp: + self._log_print(f"ERROR: No datasets could be interpolated") + return False + + # Combine into single dataset + combined_ds = self._create_combined_dataset(datasets_interp, time_grid, vars_to_keep) + + # Encode instrument names as flags + ds_to_save = self._encode_instrument_as_flags(combined_ds) + + # Write output file + output_filename = f"{mooring_name}_mooring{file_suffix}.nc" + output_filepath = proc_dir / output_filename + + writer = NetCdfWriter(ds_to_save) + writer_params = self._get_netcdf_writer_params() + writer.write(str(output_filepath), **writer_params) + + self._log_print(f"Successfully wrote combined dataset: {output_filepath}") + self._log_print(f"Combined dataset shape: {dict(ds_to_save.dims)}") + self._log_print(f"Variables: {list(ds_to_save.data_vars)}") + + return True + + except Exception as e: + self._log_print(f"ERROR during Stage 3 processing: {e}") + return False + + +def stage3_mooring(mooring_name: str, 
basedir: str, + output_path: Optional[str] = None, + file_suffix: str = '_use') -> bool: + """ + Process Stage 3 for a single mooring (backwards compatibility function). + + Args: + mooring_name: Name of the mooring to process + basedir: Base directory containing the data + output_path: Optional output path override + file_suffix: Suffix for input files ('_use' or '_raw') + + Returns: + bool: True if processing completed successfully + """ + processor = Stage3Processor(basedir) + return processor.process_mooring(mooring_name, output_path, file_suffix) + + +def process_multiple_moorings_stage3(mooring_list: List[str], + basedir: str, + file_suffix: str = '_use') -> Dict[str, bool]: + """ + Process Stage 3 for multiple moorings. + + Args: + mooring_list: List of mooring names to process + basedir: Base directory containing the data + file_suffix: Suffix for input files ('_use' or '_raw') + + Returns: + Dict mapping mooring names to success status + """ + processor = Stage3Processor(basedir) + results = {} + + for mooring_name in mooring_list: + print(f"\n{'='*50}") + print(f"Processing Stage 3 for mooring {mooring_name}") + print(f"{'='*50}") + + results[mooring_name] = processor.process_mooring(mooring_name, file_suffix=file_suffix) + + return results + + +# Example usage +if __name__ == "__main__": + # Your mooring list + moorlist = ['dsE_1_2018'] + + basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/' + + # Process all moorings + results = process_multiple_moorings_stage3(moorlist, basedir) + + # Print summary + print(f"\n{'='*50}") + print("STAGE 3 PROCESSING SUMMARY") + print(f"{'='*50}") + for mooring, success in results.items(): + status = "SUCCESS" if success else "FAILED" + print(f"{mooring}: {status}") diff --git a/tests/test_stage2.py b/tests/test_stage2.py index f42716d..4044de1 100644 --- a/tests/test_stage2.py +++ b/tests/test_stage2.py @@ -10,17 +10,20 @@ - Added verification that original datasets are not modified - Fixed date ranges in trimming tests to match actual data """ -import pytest + import tempfile -import yaml +from pathlib import Path +from unittest.mock import Mock, patch + import numpy as np import pandas as pd +import pytest import xarray as xr -from pathlib import Path -from unittest.mock import Mock, patch -from datetime import datetime, timedelta +import yaml -from oceanarray.stage2 import Stage2Processor, stage2_mooring, process_multiple_moorings_stage2 +from oceanarray.stage2 import (Stage2Processor, + process_multiple_moorings_stage2, + stage2_mooring) class TestStage2Processor: @@ -41,44 +44,44 @@ def processor(self, temp_dir): def sample_yaml_data(self): """Sample YAML configuration data for Stage 2.""" return { - 'name': 'test_mooring', - 'waterdepth': 1000, - 'longitude': -30.0, - 'latitude': 60.0, - 'deployment_time': '2018-08-12T09:00:00', - 'recovery_time': '2018-08-26T19:00:00', - 'instruments': [ + "name": "test_mooring", + "waterdepth": 1000, + "longitude": -30.0, + "latitude": 60.0, + "deployment_time": "2018-08-12T09:00:00", + "recovery_time": "2018-08-26T19:00:00", + "instruments": [ { - 'instrument': 'microcat', - 'serial': 7518, - 'depth': 100, - 'clock_offset': 300 # 5 minutes + "instrument": "microcat", + "serial": 7518, + "depth": 100, + "clock_offset": 300, # 5 minutes } - ] + ], } @pytest.fixture def sample_raw_dataset(self): """Create a sample raw dataset for testing.""" # Create time series with some data before/after deployment window - start_time = pd.to_datetime('2018-08-12T08:00:00') - end_time = 
pd.to_datetime('2018-08-26T20:00:00') - time_range = pd.date_range(start_time, end_time, freq='10min') + start_time = pd.to_datetime("2018-08-12T08:00:00") + end_time = pd.to_datetime("2018-08-26T20:00:00") + time_range = pd.date_range(start_time, end_time, freq="10min") data = { - 'temperature': (['time'], np.random.random(len(time_range)) + 20), - 'salinity': (['time'], np.random.random(len(time_range)) + 35), - 'pressure': (['time'], np.random.random(len(time_range)) + 100), - 'timeS': (['time'], np.arange(len(time_range))), # To be removed + "temperature": (["time"], np.random.random(len(time_range)) + 20), + "salinity": (["time"], np.random.random(len(time_range)) + 35), + "pressure": (["time"], np.random.random(len(time_range)) + 100), + "timeS": (["time"], np.arange(len(time_range))), # To be removed } - ds = xr.Dataset(data, coords={'time': time_range}) + ds = xr.Dataset(data, coords={"time": time_range}) # Add some metadata - ds.attrs['mooring_name'] = 'test_mooring' - ds['serial_number'] = 7518 - ds['instrument'] = 'microcat' - ds['InstrDepth'] = 100 + ds.attrs["mooring_name"] = "test_mooring" + ds["serial_number"] = 7518 + ds["instrument"] = "microcat" + ds["InstrDepth"] = 100 return ds @@ -103,27 +106,27 @@ def test_setup_logging(self, processor, temp_dir): def test_read_yaml_time_valid(self, processor): """Test reading valid time from YAML.""" - data = {'deployment_time': '2018-08-12T09:00:00'} - result = processor._read_yaml_time(data, 'deployment_time') - expected = pd.to_datetime('2018-08-12T09:00:00').to_datetime64() + data = {"deployment_time": "2018-08-12T09:00:00"} + result = processor._read_yaml_time(data, "deployment_time") + expected = pd.to_datetime("2018-08-12T09:00:00").to_datetime64() assert result == expected def test_read_yaml_time_missing(self, processor): """Test reading missing time from YAML.""" data = {} - result = processor._read_yaml_time(data, 'deployment_time') + result = processor._read_yaml_time(data, "deployment_time") assert pd.isna(result) def test_read_yaml_time_empty_string(self, processor): """Test reading empty string time from YAML.""" - data = {'deployment_time': ''} - result = processor._read_yaml_time(data, 'deployment_time') + data = {"deployment_time": ""} + result = processor._read_yaml_time(data, "deployment_time") assert pd.isna(result) def test_read_yaml_time_invalid(self, processor): """Test reading invalid time from YAML.""" - data = {'deployment_time': 'not-a-date'} - result = processor._read_yaml_time(data, 'deployment_time') + data = {"deployment_time": "not-a-date"} + result = processor._read_yaml_time(data, "deployment_time") assert pd.isna(result) def test_apply_clock_offset_zero(self, processor, sample_raw_dataset): @@ -135,7 +138,9 @@ def test_apply_clock_offset_zero(self, processor, sample_raw_dataset): np.testing.assert_array_equal(result.time.values, original_time.values) # Verify original dataset was not modified - np.testing.assert_array_equal(sample_raw_dataset.time.values, original_time.values) + np.testing.assert_array_equal( + sample_raw_dataset.time.values, original_time.values + ) def test_apply_clock_offset_positive(self, processor, sample_raw_dataset): """Test applying positive clock offset - Version 2.0 with immutability testing.""" @@ -144,16 +149,18 @@ def test_apply_clock_offset_positive(self, processor, sample_raw_dataset): result = processor._apply_clock_offset(sample_raw_dataset, offset_seconds) # Check that clock_offset variable was added - assert 'clock_offset' in result.variables - assert 
result['clock_offset'].values == offset_seconds - assert result['clock_offset'].attrs['units'] == 's' + assert "clock_offset" in result.variables + assert result["clock_offset"].values == offset_seconds + assert result["clock_offset"].attrs["units"] == "s" # Check that time was shifted forward - expected_time = original_time + np.timedelta64(offset_seconds, 's') + expected_time = original_time + np.timedelta64(offset_seconds, "s") np.testing.assert_array_equal(result.time.values, expected_time.values) # Verify original dataset was not modified - np.testing.assert_array_equal(sample_raw_dataset.time.values, original_time.values) + np.testing.assert_array_equal( + sample_raw_dataset.time.values, original_time.values + ) def test_apply_clock_offset_negative(self, processor, sample_raw_dataset): """Test applying negative clock offset - Version 2.0 with immutability testing.""" @@ -162,16 +169,18 @@ def test_apply_clock_offset_negative(self, processor, sample_raw_dataset): result = processor._apply_clock_offset(sample_raw_dataset, offset_seconds) # Check that time was shifted backward - expected_time = original_time + np.timedelta64(offset_seconds, 's') + expected_time = original_time + np.timedelta64(offset_seconds, "s") np.testing.assert_array_equal(result.time.values, expected_time.values) # Verify original dataset was not modified - np.testing.assert_array_equal(sample_raw_dataset.time.values, original_time.values) + np.testing.assert_array_equal( + sample_raw_dataset.time.values, original_time.values + ) def test_trim_to_deployment_window_both_bounds(self, processor, sample_raw_dataset): """Test trimming with both deployment and recovery times.""" - deploy_time = np.datetime64('2018-08-12T10:00:00') - recover_time = np.datetime64('2018-08-26T18:00:00') + deploy_time = np.datetime64("2018-08-12T10:00:00") + recover_time = np.datetime64("2018-08-26T18:00:00") result = processor._trim_to_deployment_window( sample_raw_dataset, deploy_time, recover_time @@ -184,8 +193,8 @@ def test_trim_to_deployment_window_both_bounds(self, processor, sample_raw_datas def test_trim_to_deployment_window_deploy_only(self, processor, sample_raw_dataset): """Test trimming with only deployment time.""" - deploy_time = np.datetime64('2018-08-12T10:00:00') - recover_time = np.datetime64('NaT') + deploy_time = np.datetime64("2018-08-12T10:00:00") + recover_time = np.datetime64("NaT") result = processor._trim_to_deployment_window( sample_raw_dataset, deploy_time, recover_time @@ -195,10 +204,12 @@ def test_trim_to_deployment_window_deploy_only(self, processor, sample_raw_datas assert result.time.min() >= deploy_time assert result.time.max() == sample_raw_dataset.time.max() - def test_trim_to_deployment_window_recover_only(self, processor, sample_raw_dataset): + def test_trim_to_deployment_window_recover_only( + self, processor, sample_raw_dataset + ): """Test trimming with only recovery time.""" - deploy_time = np.datetime64('NaT') - recover_time = np.datetime64('2018-08-26T18:00:00') + deploy_time = np.datetime64("NaT") + recover_time = np.datetime64("2018-08-26T18:00:00") result = processor._trim_to_deployment_window( sample_raw_dataset, deploy_time, recover_time @@ -210,8 +221,8 @@ def test_trim_to_deployment_window_recover_only(self, processor, sample_raw_data def test_trim_to_deployment_window_no_trimming(self, processor, sample_raw_dataset): """Test trimming with no valid times.""" - deploy_time = np.datetime64('NaT') - recover_time = np.datetime64('NaT') + deploy_time = np.datetime64("NaT") + recover_time = 
np.datetime64("NaT") result = processor._trim_to_deployment_window( sample_raw_dataset, deploy_time, recover_time @@ -220,11 +231,13 @@ def test_trim_to_deployment_window_no_trimming(self, processor, sample_raw_datas # Check that nothing is trimmed assert len(result.time) == len(sample_raw_dataset.time) - def test_trim_to_deployment_window_empty_result(self, processor, sample_raw_dataset): + def test_trim_to_deployment_window_empty_result( + self, processor, sample_raw_dataset + ): """Test trimming that results in empty dataset.""" # Use times outside the data range - deploy_time = np.datetime64('2019-01-01T00:00:00') - recover_time = np.datetime64('2019-01-02T00:00:00') + deploy_time = np.datetime64("2019-01-01T00:00:00") + recover_time = np.datetime64("2019-01-02T00:00:00") result = processor._trim_to_deployment_window( sample_raw_dataset, deploy_time, recover_time @@ -237,45 +250,41 @@ def test_add_missing_metadata(self, processor, sample_raw_dataset): """Test adding missing metadata variables.""" # Remove some metadata to test adding it back ds = sample_raw_dataset.copy() - ds = ds.drop_vars(['InstrDepth', 'instrument', 'serial_number']) + ds = ds.drop_vars(["InstrDepth", "instrument", "serial_number"]) - instrument_config = { - 'depth': 150, - 'instrument': 'new_microcat', - 'serial': 9999 - } + instrument_config = {"depth": 150, "instrument": "new_microcat", "serial": 9999} result = processor._add_missing_metadata(ds, instrument_config) - assert result['InstrDepth'].values == 150 - assert result['instrument'].values == 'new_microcat' - assert result['serial_number'].values == 9999 + assert result["InstrDepth"].values == 150 + assert result["instrument"].values == "new_microcat" + assert result["serial_number"].values == 9999 def test_add_missing_metadata_no_overwrite(self, processor, sample_raw_dataset): """Test that existing metadata is not overwritten.""" instrument_config = { - 'depth': 999, - 'instrument': 'different_instrument', - 'serial': 8888 + "depth": 999, + "instrument": "different_instrument", + "serial": 8888, } result = processor._add_missing_metadata(sample_raw_dataset, instrument_config) # Should keep original values - assert result['InstrDepth'].values == 100 - assert result['instrument'].values == 'microcat' - assert result['serial_number'].values == 7518 + assert result["InstrDepth"].values == 100 + assert result["instrument"].values == "microcat" + assert result["serial_number"].values == 7518 def test_clean_unnecessary_variables(self, processor, sample_raw_dataset): """Test removal of unnecessary variables.""" result = processor._clean_unnecessary_variables(sample_raw_dataset) # timeS should be removed - assert 'timeS' not in result.variables + assert "timeS" not in result.variables # Other variables should remain - assert 'temperature' in result.variables - assert 'salinity' in result.variables - assert 'pressure' in result.variables + assert "temperature" in result.variables + assert "salinity" in result.variables + assert "pressure" in result.variables class TestRealDataProcessing: @@ -289,7 +298,9 @@ def test_data_setup(self, tmp_path): yaml_config_file = Path("data/test_mooring.yaml") if not raw_data_file.exists() or not yaml_config_file.exists(): - pytest.skip("Real test data files not found. Run generate_test_data.py first.") + pytest.skip( + "Real test data files not found. Run generate_test_data.py first." 
+ ) # Set up test directory structure base_dir = tmp_path / "test_data" @@ -305,16 +316,16 @@ def test_data_setup(self, tmp_path): test_yaml_file.write_text(yaml_config_file.read_text()) return { - 'base_dir': base_dir, - 'proc_dir': proc_dir, - 'raw_file': test_raw_file, - 'yaml_file': test_yaml_file + "base_dir": base_dir, + "proc_dir": proc_dir, + "raw_file": test_raw_file, + "yaml_file": test_yaml_file, } def test_process_real_data_full_workflow(self, test_data_setup): """Test complete Stage 2 processing with real data - Version 2.0.""" setup = test_data_setup - processor = Stage2Processor(str(setup['base_dir'])) + processor = Stage2Processor(str(setup["base_dir"])) # Process the mooring result = processor.process_mooring("test_mooring") @@ -323,77 +334,79 @@ def test_process_real_data_full_workflow(self, test_data_setup): assert result is True # Check that output file was created - use_file = setup['proc_dir'] / "microcat" / "test_mooring_7518_use.nc" + use_file = setup["proc_dir"] / "microcat" / "test_mooring_7518_use.nc" assert use_file.exists() # Load and validate the processed file with xr.open_dataset(use_file) as ds: # Check basic structure - assert 'temperature' in ds.data_vars - assert 'pressure' in ds.data_vars - assert 'salinity' in ds.data_vars - assert 'time' in ds.coords + assert "temperature" in ds.data_vars + assert "pressure" in ds.data_vars + assert "salinity" in ds.data_vars + assert "time" in ds.coords # Check that clock offset was applied - assert 'clock_offset' in ds.variables - assert ds['clock_offset'].values == 300 # 5 minutes from config + assert "clock_offset" in ds.variables + assert ds["clock_offset"].values == 300 # 5 minutes from config # Check that timeS was removed - assert 'timeS' not in ds.variables + assert "timeS" not in ds.variables # Verify metadata is present - assert ds['serial_number'].values == 7518 - assert ds['instrument'].values == 'microcat' + assert ds["serial_number"].values == 7518 + assert ds["instrument"].values == "microcat" # Check that data was trimmed (should be less than original) - with xr.open_dataset(setup['raw_file']) as raw_ds: + with xr.open_dataset(setup["raw_file"]) as raw_ds: assert len(ds.time) <= len(raw_ds.time) # Time should be shifted due to clock offset if len(ds.time) > 0 and len(raw_ds.time) > 0: # First time point should be shifted by clock offset time_diff = ds.time[0].values - raw_ds.time[0].values - expected_diff = np.timedelta64(300, 's') # 5 minutes - assert abs(time_diff - expected_diff) < np.timedelta64(1, 's') + expected_diff = np.timedelta64(300, "s") # 5 minutes + assert abs(time_diff - expected_diff) < np.timedelta64(1, "s") def test_process_with_modified_times(self, test_data_setup): """Test processing with modified deployment/recovery times - Version 2.0 with correct dates.""" setup = test_data_setup # Load and modify the YAML config - with open(setup['yaml_file'], 'r') as f: + with open(setup["yaml_file"], "r") as f: config = yaml.safe_load(f) # First check what time range we actually have in the data - with xr.open_dataset(setup['raw_file']) as raw_ds: + with xr.open_dataset(setup["raw_file"]) as raw_ds: data_start = raw_ds.time.min().values data_end = raw_ds.time.max().values print(f"Raw data time range: {data_start} to {data_end}") # Set a restrictive time window within the actual data range # Data is on 2018-08-13, so use the correct date - config['deployment_time'] = '2018-08-13T08:05:00' # 5 minutes after data start - config['recovery_time'] = '2018-08-13T08:15:00' # 10 minute window + 
config["deployment_time"] = "2018-08-13T08:05:00" # 5 minutes after data start + config["recovery_time"] = "2018-08-13T08:15:00" # 10 minute window # Write modified config - with open(setup['yaml_file'], 'w') as f: + with open(setup["yaml_file"], "w") as f: yaml.dump(config, f) - processor = Stage2Processor(str(setup['base_dir'])) + processor = Stage2Processor(str(setup["base_dir"])) result = processor.process_mooring("test_mooring") assert result is True # Check that the processed file has limited data - use_file = setup['proc_dir'] / "microcat" / "test_mooring_7518_use.nc" + use_file = setup["proc_dir"] / "microcat" / "test_mooring_7518_use.nc" with xr.open_dataset(use_file) as ds: # Should have much less data due to restrictive time window - with xr.open_dataset(setup['raw_file']) as raw_ds: - assert len(ds.time) < len(raw_ds.time), f"Expected trimmed data, got {len(ds.time)} vs {len(raw_ds.time)}" + with xr.open_dataset(setup["raw_file"]) as raw_ds: + assert len(ds.time) < len( + raw_ds.time + ), f"Expected trimmed data, got {len(ds.time)} vs {len(raw_ds.time)}" # All data should be within the specified window (accounting for clock offset) - deploy_time = pd.to_datetime('2018-08-13T08:05:00') - recover_time = pd.to_datetime('2018-08-13T08:15:00') + deploy_time = pd.to_datetime("2018-08-13T08:05:00") + recover_time = pd.to_datetime("2018-08-13T08:15:00") assert ds.time.min() >= np.datetime64(deploy_time) assert ds.time.max() <= np.datetime64(recover_time) @@ -403,16 +416,16 @@ def test_process_missing_raw_file(self, test_data_setup): setup = test_data_setup # Remove the raw file - setup['raw_file'].unlink() + setup["raw_file"].unlink() - processor = Stage2Processor(str(setup['base_dir'])) + processor = Stage2Processor(str(setup["base_dir"])) result = processor.process_mooring("test_mooring") # Should fail gracefully assert result is False # Check log file contains warning - log_files = list(setup['proc_dir'].glob("*_stage2.mooring.log")) + log_files = list(setup["proc_dir"].glob("*_stage2.mooring.log")) assert len(log_files) == 1 log_content = log_files[0].read_text() assert "Raw file not found" in log_content @@ -432,7 +445,7 @@ def test_process_missing_config(self, tmp_path): class TestConvenienceFunctions: """Test convenience functions - Version 2.0.""" - @patch('oceanarray.stage2.Stage2Processor') + @patch("oceanarray.stage2.Stage2Processor") def test_stage2_mooring(self, mock_processor_class): """Test backwards compatibility function.""" mock_processor = Mock() @@ -445,7 +458,7 @@ def test_stage2_mooring(self, mock_processor_class): mock_processor_class.assert_called_once_with("/test/dir") mock_processor.process_mooring.assert_called_once_with("test_mooring", None) - @patch('oceanarray.stage2.Stage2Processor') + @patch("oceanarray.stage2.Stage2Processor") def test_process_multiple_moorings_stage2(self, mock_processor_class): """Test batch processing function.""" mock_processor = Mock() @@ -455,11 +468,7 @@ def test_process_multiple_moorings_stage2(self, mock_processor_class): moorings = ["mooring1", "mooring2", "mooring3"] results = process_multiple_moorings_stage2(moorings, "/test/dir") - expected = { - "mooring1": True, - "mooring2": False, - "mooring3": True - } + expected = {"mooring1": True, "mooring2": False, "mooring3": True} assert results == expected assert mock_processor.process_mooring.call_count == 3 diff --git a/tests/test_stage3.py b/tests/test_stage3.py new file mode 100644 index 0000000..06a5af1 --- /dev/null +++ b/tests/test_stage3.py @@ -0,0 +1,537 @@ +""" +Tests 
for oceanarray.stage3 module. + +Tests cover combining multiple instruments into single datasets with interpolation. + +Version: 1.0 +Last updated: 2025-09-07 +Changes: +- Initial test suite for Stage 3 processing +- Mock data generation for multi-instrument scenarios +- Real data integration tests +- Timing analysis and interpolation validation +""" +import pytest +import tempfile +import yaml +import numpy as np +import pandas as pd +import xarray as xr +from pathlib import Path +from unittest.mock import Mock, patch +from datetime import datetime, timedelta + +from oceanarray.stage3 import Stage3Processor, stage3_mooring, process_multiple_moorings_stage3 + + +def create_mock_instrument_dataset(start_time, end_time, interval_min, + instrument_type='microcat', serial=1234, depth=100): + """Create a mock instrument dataset for testing.""" + time_range = pd.date_range(start_time, end_time, freq=f'{interval_min}min') + + # Create realistic but synthetic data + n_points = len(time_range) + np.random.seed(serial) # Reproducible based on serial number + + data = { + 'temperature': (['time'], 15 + 5 * np.sin(np.linspace(0, 4*np.pi, n_points)) + + 0.1 * np.random.random(n_points)), + 'salinity': (['time'], 35 + 0.5 * np.cos(np.linspace(0, 3*np.pi, n_points)) + + 0.05 * np.random.random(n_points)), + 'pressure': (['time'], depth + 1 + 0.2 * np.sin(np.linspace(0, 6*np.pi, n_points)) + + 0.1 * np.random.random(n_points)), + } + + ds = xr.Dataset(data, coords={'time': time_range}) + + # Add metadata + ds.attrs['mooring_name'] = 'test_mooring' + ds['serial_number'] = serial + ds['instrument'] = instrument_type + ds['InstrDepth'] = depth + ds['clock_offset'] = 0 + + return ds + + +class TestStage3Processor: + """Test cases for Stage3Processor class.""" + + @pytest.fixture + def temp_dir(self): + """Create a temporary directory for testing.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def processor(self, temp_dir): + """Create a Stage3Processor instance for testing.""" + return Stage3Processor(str(temp_dir)) + + @pytest.fixture + def sample_yaml_data(self): + """Sample YAML configuration data for Stage 3.""" + return { + 'name': 'test_mooring', + 'waterdepth': 1000, + 'longitude': -30.0, + 'latitude': 60.0, + 'deployment_time': '2018-08-12T08:00:00', + 'recovery_time': '2018-08-26T20:00:00', + 'instruments': [ + { + 'instrument': 'microcat', + 'serial': 7518, + 'depth': 100 + }, + { + 'instrument': 'microcat', + 'serial': 7519, + 'depth': 200 + }, + { + 'instrument': 'adcp', + 'serial': 1234, + 'depth': 300 + } + ] + } + + @pytest.fixture + def mock_datasets(self): + """Create mock datasets representing different instruments.""" + start_time = '2018-08-12T08:00:00' + end_time = '2018-08-12T12:00:00' # 4 hour window for testing + + # Different sampling rates to test interpolation + datasets = [ + create_mock_instrument_dataset(start_time, end_time, 10, 'microcat', 7518, 100), + create_mock_instrument_dataset(start_time, end_time, 5, 'microcat', 7519, 200), + create_mock_instrument_dataset(start_time, end_time, 1, 'adcp', 1234, 300), + ] + + return datasets + + def test_init(self, temp_dir): + """Test Stage3Processor initialization.""" + processor = Stage3Processor(str(temp_dir)) + assert processor.base_dir == temp_dir + assert processor.log_file is None + + def test_setup_logging(self, processor, temp_dir): + """Test logging setup.""" + mooring_name = "test_mooring" + output_path = temp_dir / "output" + output_path.mkdir() + + 
processor._setup_logging(mooring_name, output_path) + + assert processor.log_file is not None + assert processor.log_file.parent == output_path + assert mooring_name in processor.log_file.name + assert "stage3.mooring.log" in processor.log_file.name + + def test_ensure_instrument_metadata(self, processor): + """Test metadata addition to datasets.""" + # Create dataset missing some metadata + ds = xr.Dataset({'temperature': (['time'], [20.0, 20.1])}, + coords={'time': pd.date_range('2018-01-01', periods=2, freq='H')}) + + instrument_config = { + 'depth': 150, + 'instrument': 'test_instrument', + 'serial': 9999 + } + + result = processor._ensure_instrument_metadata(ds, instrument_config) + + assert result['InstrDepth'].values == 150 + assert result['instrument'].values == 'test_instrument' + assert result['serial_number'].values == 9999 + + def test_clean_dataset_variables(self, processor): + """Test removal of unnecessary variables.""" + # Create dataset with variables to remove + ds = xr.Dataset({ + 'temperature': (['time'], [20.0, 20.1]), + 'timeS': (['time'], [0, 1]), + 'density': (['time'], [1025.0, 1025.1]), + 'potential_temperature': (['time'], [19.9, 20.0]) + }, coords={'time': pd.date_range('2018-01-01', periods=2, freq='H')}) + + result = processor._clean_dataset_variables(ds) + + # Should keep temperature, remove others + assert 'temperature' in result.variables + assert 'timeS' not in result.variables + assert 'density' not in result.variables + assert 'potential_temperature' not in result.variables + + def test_analyze_timing_info(self, processor, mock_datasets): + """Test timing analysis across multiple datasets.""" + with patch.object(processor, '_log_print'): + time_grid, start_time, end_time = processor._analyze_timing_info(mock_datasets) + + # Check that we get a reasonable time grid + assert len(time_grid) > 0 + assert isinstance(start_time, np.datetime64) + assert isinstance(end_time, np.datetime64) + assert start_time < end_time + + # Grid should span the full time range + assert time_grid[0] >= start_time + assert time_grid[-1] <= end_time + + def test_interpolate_datasets(self, processor, mock_datasets): + """Test interpolation of datasets onto common grid.""" + # Create a simple time grid + start_time = mock_datasets[0].time.values[0] + end_time = mock_datasets[0].time.values[-1] + time_grid = np.arange(start_time, end_time, np.timedelta64(10, 'm')) + + with patch.object(processor, '_log_print'): + interpolated = processor._interpolate_datasets(mock_datasets, time_grid) + + # Should have same number of datasets + assert len(interpolated) == len(mock_datasets) + + # All should have same time coordinate + for ds in interpolated: + np.testing.assert_array_equal(ds.time.values, time_grid) + + def test_merge_global_attrs(self, processor, mock_datasets): + """Test merging of global attributes.""" + # Add different attributes to datasets + mock_datasets[0].attrs['attr1'] = 'value1' + mock_datasets[0].attrs['common'] = 'first' + mock_datasets[1].attrs['attr2'] = 'value2' + mock_datasets[1].attrs['common'] = 'second' + + merged = processor._merge_global_attrs(mock_datasets, order='last') + + assert merged['attr1'] == 'value1' + assert merged['attr2'] == 'value2' + assert merged['common'] == 'second' # Last one wins + + def test_merge_var_attrs(self, processor, mock_datasets): + """Test merging of variable attributes.""" + # Add attributes to temperature variable + mock_datasets[0]['temperature'].attrs['units'] = 'celsius' + mock_datasets[0]['temperature'].attrs['source'] = 
'first' + mock_datasets[1]['temperature'].attrs['long_name'] = 'Temperature' + mock_datasets[1]['temperature'].attrs['source'] = 'second' + + merged = processor._merge_var_attrs('temperature', mock_datasets, order='last') + + assert merged['units'] == 'celsius' + assert merged['long_name'] == 'Temperature' + assert merged['source'] == 'second' # Last one wins + + def test_create_combined_dataset(self, processor, mock_datasets): + """Test creation of combined dataset.""" + # First interpolate onto common grid + start_time = mock_datasets[0].time.values[0] + end_time = mock_datasets[0].time.values[-1] + time_grid = np.arange(start_time, end_time, np.timedelta64(10, 'm')) + + with patch.object(processor, '_log_print'): + interpolated = processor._interpolate_datasets(mock_datasets, time_grid) + combined = processor._create_combined_dataset(interpolated, time_grid) + + # Check dimensions + assert 'time' in combined.dims + assert 'N_LEVELS' in combined.dims + assert combined.dims['N_LEVELS'] == len(mock_datasets) + + # Check that variables are present + expected_vars = ['temperature', 'salinity', 'pressure'] + for var in expected_vars: + assert var in combined.data_vars + assert combined[var].dims == ('time', 'N_LEVELS') + + # Check coordinate metadata + assert 'nominal_depth' in combined.coords + assert 'serial_number' in combined.coords + assert 'instrument' in combined.coords + + # Check metadata values + expected_depths = [100, 200, 300] + expected_serials = [7518, 7519, 1234] + np.testing.assert_array_equal(combined.nominal_depth.values, expected_depths) + np.testing.assert_array_equal(combined.serial_number.values, expected_serials) + + def test_encode_instrument_as_flags(self, processor): + """Test encoding of instrument names as integer flags.""" + # Create dataset with instrument coordinate + ds = xr.Dataset( + {'temperature': (['time', 'N_LEVELS'], np.random.random((10, 3)))}, + coords={ + 'time': pd.date_range('2018-01-01', periods=10, freq='H'), + 'N_LEVELS': np.arange(3), + 'instrument': ('N_LEVELS', ['microcat', 'adcp', 'microcat']) + } + ) + + result = processor._encode_instrument_as_flags(ds) + + # Check that instrument_id was created + assert 'instrument_id' in result.variables + assert 'instrument' not in result.variables # Original should be removed + + # Check flag values + assert 'flag_values' in result['instrument_id'].attrs + assert 'flag_meanings' in result['instrument_id'].attrs + + # Check global attribute + assert 'instrument_names' in result.attrs + + +class TestStage3Integration: + """Integration tests for Stage 3 processing.""" + + @pytest.fixture + def test_data_setup(self, tmp_path): + """Set up test environment with mock instrument files.""" + base_dir = tmp_path / "test_data" + proc_dir = base_dir / "moor" / "proc" / "test_mooring" + proc_dir.mkdir(parents=True) + + # Create YAML config + yaml_data = { + 'name': 'test_mooring', + 'waterdepth': 1000, + 'instruments': [ + {'instrument': 'microcat', 'serial': 7518, 'depth': 100}, + {'instrument': 'microcat', 'serial': 7519, 'depth': 200}, + ] + } + + config_file = proc_dir / "test_mooring.mooring.yaml" + with open(config_file, 'w') as f: + yaml.dump(yaml_data, f) + + # Create mock instrument files + start_time = '2018-08-12T08:00:00' + end_time = '2018-08-12T12:00:00' + + for instrument_config in yaml_data['instruments']: + instrument_type = instrument_config['instrument'] + serial = instrument_config['serial'] + depth = instrument_config['depth'] + + # Create instrument directory + inst_dir = proc_dir / 
instrument_type + inst_dir.mkdir(exist_ok=True) + + # Create mock dataset + interval = 10 if serial == 7518 else 5 # Different sampling rates + ds = create_mock_instrument_dataset(start_time, end_time, interval, + instrument_type, serial, depth) + + # Save as NetCDF + filename = f"test_mooring_{serial}_use.nc" + filepath = inst_dir / filename + ds.to_netcdf(filepath) + + return { + 'base_dir': base_dir, + 'proc_dir': proc_dir, + 'config_file': config_file, + 'yaml_data': yaml_data + } + + def test_full_stage3_processing(self, test_data_setup): + """Test complete Stage 3 processing workflow.""" + setup = test_data_setup + processor = Stage3Processor(str(setup['base_dir'])) + + # Process the mooring + result = processor.process_mooring("test_mooring") + + # Check that processing succeeded + assert result is True + + # Check that output file was created + output_file = setup['proc_dir'] / "test_mooring_mooring_use.nc" + assert output_file.exists() + + # Load and validate the combined dataset + with xr.open_dataset(output_file) as ds: + # Check dimensions + assert 'time' in ds.dims + assert 'N_LEVELS' in ds.dims + assert ds.dims['N_LEVELS'] == 2 # Two instruments + + # Check variables + assert 'temperature' in ds.data_vars + assert 'salinity' in ds.data_vars + assert 'pressure' in ds.data_vars + + # Check coordinate metadata + assert 'nominal_depth' in ds.coords + assert 'serial_number' in ds.coords + assert 'instrument_id' in ds.data_vars # This is a data variable, not coordinate + + # Validate metadata values + expected_depths = [100.0, 200.0] + expected_serials = [7518, 7519] + np.testing.assert_array_equal(ds.nominal_depth.values, expected_depths) + np.testing.assert_array_equal(ds.serial_number.values, expected_serials) + + def test_missing_instruments_warning(self, test_data_setup): + """Test warning when some instruments are missing.""" + setup = test_data_setup + + # Add extra instrument to YAML that doesn't have a file + yaml_data = setup['yaml_data'].copy() + yaml_data['instruments'].append({ + 'instrument': 'adcp', + 'serial': 1234, + 'depth': 300 + }) + + # Write updated config + with open(setup['config_file'], 'w') as f: + yaml.dump(yaml_data, f) + + processor = Stage3Processor(str(setup['base_dir'])) + result = processor.process_mooring("test_mooring") + + # Should still succeed with available instruments + assert result is True + + # Check log file mentions missing instrument + log_files = list(setup['proc_dir'].glob("*_stage3.mooring.log")) + assert len(log_files) == 1 + log_content = log_files[0].read_text() + assert "Missing instruments" in log_content + assert "adcp:1234" in log_content + + def test_different_sampling_rates_warning(self, test_data_setup): + """Test warnings about different sampling rates.""" + setup = test_data_setup + processor = Stage3Processor(str(setup['base_dir'])) + + # Process (datasets have 10min and 5min intervals) + result = processor.process_mooring("test_mooring") + assert result is True + + # Check log mentions timing analysis + log_files = list(setup['proc_dir'].glob("*_stage3.mooring.log")) + log_content = log_files[0].read_text() + assert "TIMING ANALYSIS" in log_content + assert "min intervals" in log_content + + def test_no_valid_datasets(self, tmp_path): + """Test handling when no valid datasets are found.""" + base_dir = tmp_path / "test_data" + proc_dir = base_dir / "moor" / "proc" / "test_mooring" + proc_dir.mkdir(parents=True) + + # Create YAML with instruments but no data files + yaml_data = { + 'name': 'test_mooring', + 
'instruments': [{'instrument': 'microcat', 'serial': 9999, 'depth': 100}] + } + + config_file = proc_dir / "test_mooring.mooring.yaml" + with open(config_file, 'w') as f: + yaml.dump(yaml_data, f) + + processor = Stage3Processor(str(base_dir)) + result = processor.process_mooring("test_mooring") + + assert result is False + + def test_custom_variables_to_keep(self, test_data_setup): + """Test processing with custom variable selection.""" + setup = test_data_setup + processor = Stage3Processor(str(setup['base_dir'])) + + # Process with only temperature + result = processor.process_mooring("test_mooring", vars_to_keep=['temperature']) + assert result is True + + # Check output only has temperature + output_file = setup['proc_dir'] / "test_mooring_mooring_use.nc" + with xr.open_dataset(output_file) as ds: + assert 'temperature' in ds.data_vars + assert 'salinity' not in ds.data_vars + assert 'pressure' not in ds.data_vars + + +class TestConvenienceFunctions: + """Test convenience functions.""" + + @patch('oceanarray.stage3.Stage3Processor') + def test_stage3_mooring(self, mock_processor_class): + """Test backwards compatibility function.""" + mock_processor = Mock() + mock_processor.process_mooring.return_value = True + mock_processor_class.return_value = mock_processor + + result = stage3_mooring("test_mooring", "/test/dir", file_suffix='_use') + + assert result is True + mock_processor_class.assert_called_once_with("/test/dir") + mock_processor.process_mooring.assert_called_once_with("test_mooring", None, '_use') + + @patch('oceanarray.stage3.Stage3Processor') + def test_process_multiple_moorings_stage3(self, mock_processor_class): + """Test batch processing function.""" + mock_processor = Mock() + mock_processor.process_mooring.side_effect = [True, False, True] + mock_processor_class.return_value = mock_processor + + moorings = ["mooring1", "mooring2", "mooring3"] + results = process_multiple_moorings_stage3(moorings, "/test/dir", file_suffix='_use') + + expected = { + "mooring1": True, + "mooring2": False, + "mooring3": True + } + assert results == expected + assert mock_processor.process_mooring.call_count == 3 + + +class TestErrorHandling: + """Test error handling scenarios.""" + + def test_missing_config_file(self, tmp_path): + """Test handling of missing configuration file.""" + base_dir = tmp_path / "test_data" + proc_dir = base_dir / "moor" / "proc" / "test_mooring" + proc_dir.mkdir(parents=True) + + processor = Stage3Processor(str(base_dir)) + result = processor.process_mooring("test_mooring") + + assert result is False + + def test_invalid_yaml_file(self, tmp_path): + """Test handling of invalid YAML file.""" + base_dir = tmp_path / "test_data" + proc_dir = base_dir / "moor" / "proc" / "test_mooring" + proc_dir.mkdir(parents=True) + + # Create invalid YAML + invalid_yaml = proc_dir / "test_mooring.mooring.yaml" + invalid_yaml.write_text("invalid: yaml: content: [") + + processor = Stage3Processor(str(base_dir)) + result = processor.process_mooring("test_mooring") + + assert result is False + + def test_processing_directory_not_found(self, tmp_path): + """Test handling when processing directory doesn't exist.""" + processor = Stage3Processor(str(tmp_path)) + result = processor.process_mooring("nonexistent_mooring") + + assert result is False + + +if __name__ == "__main__": + # Version 1.0 - Initial Stage 3 test suite + pytest.main([__file__, "-v", "--tb=short"]) From 74eeaa59245b68ee1fc559b24835abd66afb7118 Mon Sep 17 00:00:00 2001 From: Eleanor Frajka-Williams Date: Sun, 7 Sep 
2025 13:27:46 +0200
Subject: [PATCH 2/3] [DOC] Update to name it step1

---
 docs/source/methods/time_gridding.rst         | 390 +++++++++++++++---
 .../{demo_stage3.ipynb => demo_step1.ipynb}   |   2 +-
 oceanarray/{stage3.py => time_gridding.py}    | 303 ++++++++++++--
 .../{test_stage3.py => test_time_gridding.py} | 136 ++++--
 4 files changed, 698 insertions(+), 133 deletions(-)
 rename notebooks/{demo_stage3.ipynb => demo_step1.ipynb} (98%)
 rename oceanarray/{stage3.py => time_gridding.py} (63%)
 rename tests/{test_stage3.py => test_time_gridding.py} (76%)

diff --git a/docs/source/methods/time_gridding.rst b/docs/source/methods/time_gridding.rst
index 1637c46..8eb92b9 100644
--- a/docs/source/methods/time_gridding.rst
+++ b/docs/source/methods/time_gridding.rst
@@ -1,101 +1,375 @@
-Gridding in time
-================
+Step 1: Time Gridding and Optional Filtering
+============================================

-Individual records may be sampled at a much higher rate than needed for a specific purpose. When concatenating multiple records together, it may be useful for some applications to filter out high-frequency variability, such as tidal signals, and subsample the time series to a lower frequency. This describes **filtering** as used in the RAPID array to remove tides and subsample to half-daily (12-hourly) intervals, which makes the dataset more management after 20 years of records.
+This document describes **Step 1** in the mooring-level processing workflow: combining multiple instruments onto a common time grid with optional time-domain filtering. This step consolidates individual instrument time series from different sampling rates into a unified mooring dataset suitable for further analysis.

-This step is performed after standardisation and trimming, and prior to vertical interpolation and transport calculation.
+This represents the first step in mooring-level processing:
+
+- **Step 1**: Time gridding (this document)
+- **Step 2**: Vertical gridding (future)
+- **Step 3**: Multi-deployment stitching (future)
+
+This step is performed after individual instrument processing (Stages 1-3: standardisation, trimming, QC/calibration) and prior to vertical interpolation and transport calculations.
+
+**CRITICAL**: Any filtering is applied to individual instrument records **BEFORE** interpolation onto the common time grid, because the usual workflow is low-pass filtering followed by downsampling; downsampling first could alias high-frequency variability into the filtered dataset.

 1. Overview
 -----------

-High-frequency variability, such as tidal signals, is removed from the raw time series via low-pass filtering. This step enhances the comparability between instruments and suppresses aliasing due to undersampled high-frequency processes.
+Multiple instruments on a mooring often sample at different rates, creating challenges for comparative analysis and further processing. Time gridding addresses this by:

-In the RAPID framework (see `hydro_grid.m`), a **2-day low-pass Butterworth filter** (6th order) is applied to temperature, salinity, and pressure time series. This is a standard approach used for moored time series processing, ensuring consistency across instruments and deployments.
+
+- Optionally applying time-domain filtering to individual instrument records
+- Interpolating all instruments onto a common time grid
+- Preserving high-frequency temporal resolution when desired
+- Creating unified mooring datasets with standardized structure
+
+The process combines individual instrument files (`*_use.nc`) into single mooring files (`*_mooring_use.nc`) with an N_LEVELS dimension representing the different instruments/depths.

 2. Purpose
 ----------

-- Suppress tidal and inertial variability
-- Prepare data for interpolation and averaging
-- Enable meaningful anomaly detection and gridding
+- **Optional filtering**: Apply low-pass filters to remove high-frequency variability
+- **Common time grid**: Align instruments with different sampling rates onto a single time vector
+- **Data consolidation**: Create single files representing entire moorings
+- **Preserve resolution** (optional): Maintain temporal detail for high-frequency analysis
+- **Standardized structure**: Enable consistent downstream processing

-3. Input
---------
+3. Processing Workflow
+----------------------

-- Trimmed `xarray.Dataset` with variables like `TEMPERATURE`, `SALINITY`, `PRESSURE`, and `TIME`
-- Sampling frequency (or timestamp resolution)
+The correct processing order is essential for data integrity:

-4. Output
----------
+1. **Load instrument datasets**: Read all available `*_use.nc` files for a mooring
+2. **Apply filtering to individual datasets**: Filter each instrument on its native time grid
+3. **Timing analysis**: Analyze sampling rates and detect temporal coverage
+4. **Common grid creation**: Calculate median sampling interval across all instruments. **Note:** alternative grid choices may be worth considering in future.
+5. **Interpolation**: Interpolate filtered datasets onto the common temporal grid
+6. **Dataset combination**: Merge into single dataset with N_LEVELS dimension
+7. **Metadata encoding**: Convert string variables to CF-compliant integer flags
+8. **NetCDF output**: Write combined mooring dataset

-- Low-pass filtered `xarray.Dataset` with the same variables
-- Optionally: filtered values as separate variables (e.g., `TEMPERATURE_LP`)
-- Metadata note: `filtered = "2-day Butterworth, 6th order"`

-5. Method
----------
+Implemented Filtering Types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^

-- The filter cutoff is typically 2 days (i.e., ~0.0058 Hz)
-- Filter type: **Butterworth**, **6th order**
-- Apply the filter to each variable independently
-- Optionally interpolate across short gaps (e.g., <10 days)
-- Gaps larger than a specified duration are marked as NaN post-filtering
+**Low-pass Butterworth Filter (RAPID-style)**

-6. Example
-----------
+- **Purpose**: Remove tidal and inertial variability for long-term analysis
+- **Default parameters**: 2-day cutoff, 6th order Butterworth
+- **Applications**: Climate studies, transport calculations, data volume reduction
+- **Gap handling**: Filters continuous segments separately
+
+A possible future addition (especially relevant for bottom pressure records):
+
+**Harmonic De-tiding (future)**
+
+- **Purpose**: Remove specific tidal constituents using harmonic analysis
+- **Status**: Placeholder; currently falls back to low-pass filtering
+- **Applications**: Precise tidal removal, retained sub-tidal variability
+
+4. Current Implementation
+-------------------------
+
+The time gridding process is implemented in the :mod:`oceanarray.time_gridding` module, which provides automated processing for mooring datasets.
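+
+For orientation, the sketch below shows single-mooring use of the processor class.
+It is an illustrative sketch rather than a fixed API: it assumes that
+:meth:`TimeGriddingProcessor.process_mooring` accepts the same ``filter_type`` and
+``filter_params`` arguments as the batch helper shown in the usage examples below
+(Section 4.6), and the base directory is hypothetical.
+
+.. code-block:: python
+
+    from oceanarray.time_gridding import TimeGriddingProcessor
+
+    # Combine all instruments for one mooring onto a common time grid,
+    # applying RAPID-style 2-day low-pass filtering to each instrument
+    # on its native grid before interpolation.
+    processor = TimeGriddingProcessor('/path/to/data/')
+    success = processor.process_mooring(
+        'mooring1',
+        filter_type='lowpass',
+        filter_params={'cutoff_days': 2.0, 'order': 6},
+    )
+
+4.1.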
Input Requirements +^^^^^^^^^^^^^^^^^^^^^^^ + +The :class:`oceanarray.time_gridding.TimeGriddingProcessor` class processes Stage 1, 2 or 3 output files: + +- **Individual instrument files** (`*_use.nc`): Trimmed and clock-corrected time series. It can also be applied to `*_qc.nc`. +- **YAML configuration**: Mooring metadata with instrument specifications +- **Multiple sampling rates**: Automatic detection and handling of different temporal resolutions + +4.2. Processing Workflow Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`TimeGriddingProcessor.process_mooring` method follows these steps: + +1. **Load instrument datasets**: Read all available `*_use.nc` files for a mooring +2. **Apply individual filtering**: Use :meth:`_apply_time_filtering_single` on each dataset +3. **Timing analysis**: Analyze sampling rates using :meth:`_analyze_timing_info` +4. **Common grid creation**: Calculate median sampling interval across filtered instruments +5. **Interpolation**: Use :meth:`_interpolate_datasets` on filtered data +6. **Dataset combination**: Merge using :meth:`_create_combined_dataset` +7. **Metadata encoding**: Apply :meth:`_encode_instrument_as_flags` +8. **NetCDF output**: Write combined mooring dataset + +4.3. Filtering Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Individual Dataset Filtering** + +The :meth:`_apply_time_filtering_single` method processes each instrument separately: + +.. code-block:: python + + def _apply_time_filtering_single(self, dataset, filter_type, filter_params): + """Apply filtering to individual instrument on native time grid.""" + if filter_type == 'lowpass': + return self._apply_lowpass_filter(dataset, filter_params) + elif filter_type == 'detide': + return self._apply_detiding_filter(dataset, filter_params) + else: + return dataset # No filtering + +**Low-pass Butterworth Filter** + +The :meth:`_apply_lowpass_filter` method implements RAPID-style filtering: + +- **Frequency analysis**: Validates cutoff frequency against Nyquist limit +- **Filter design**: 6th order Butterworth low-pass filter using scipy.signal +- **Gap handling**: Processes continuous segments separately via :meth:`_filter_with_gaps` +- **Quality control**: Checks data length and validity before filtering +- **Robust processing**: Graceful fallbacks when filtering fails + +**Filter Parameters** .. code-block:: python - from oceanarray.methods.filtering import apply_lowpass_filter + filter_params = { + 'cutoff_days': 2.0, # Cutoff frequency in days + 'order': 6, # Filter order + 'method': 'butterworth' # Filter type + } + +4.4. Timing Analysis and Warnings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The processor provides comprehensive timing analysis: + +- **Sampling rate detection**: Identifies median intervals for each instrument +- **Interpolation warnings**: Alerts when large sampling rate differences exist (>2x) +- **Missing instrument alerts**: Compares loaded files against YAML configuration +- **Irregular sampling detection**: Flags instruments with >10% timing variability +- **Filter impact assessment**: Reports changes to original sampling rates + +4.5. Configuration Example +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Time gridding uses existing YAML mooring configurations: + +.. code-block:: yaml + + name: mooring_name + instruments: + - instrument: microcat + serial: 7518 + depth: 100 + - instrument: adcp + serial: 1234 + depth: 300 + +4.6. Usage Examples +^^^^^^^^^^^^^^^^^^^ + +**Basic Processing (No Filtering)** + +.. 
code-block:: python + + from oceanarray.time_gridding import process_multiple_moorings_time_gridding + + # Process moorings without filtering + moorings = ['mooring1', 'mooring2'] + results = process_multiple_moorings_time_gridding(moorings, basedir) + +**RAPID-style De-tiding** + +.. code-block:: python - ds_filtered = apply_lowpass_filter(ds_trimmed, cutoff_days=2.0, order=6) + # Apply 2-day low-pass filter (RAPID-style) + results = process_multiple_moorings_time_gridding( + moorings, basedir, + filter_type='lowpass', + filter_params={'cutoff_days': 2.0, 'order': 6} + ) -.. code-block:: text +**Custom Filter Parameters** + +.. code-block:: python + + # Custom filter settings + results = process_multiple_moorings_time_gridding( + moorings, basedir, + filter_type='lowpass', + filter_params={'cutoff_days': 1.0, 'order': 4} + ) + +5. Output Format +---------------- + +The time-gridded output includes: + +- **Combined mooring dataset** (`*_mooring_use.nc`) with: + - Time coordinates common to all instruments + - N_LEVELS dimension representing instrument/depth levels + - Variables stacked across instruments with NaN for missing data + - Comprehensive metadata preservation + - Filter provenance when filtering applied + +- **Processing logs** with detailed information about: + - Filtering decisions and parameters + - Timing analysis and interpolation decisions + - Missing instruments and sampling rate warnings + - Processing success/failure status + +**Example output structure:** + +.. code-block:: python - Dimensions: (TIME: 104576) + Dimensions: (time: 8640, N_LEVELS: 3) Coordinates: - * TIME (TIME) datetime64[ns] ... + * time (time) datetime64[ns] 2018-08-12T08:00:00 ... 2018-08-26T20:00:00 + * N_LEVELS (N_LEVELS) int64 0 1 2 + nominal_depth (N_LEVELS) float32 100.0 200.0 300.0 + serial_number (N_LEVELS) int64 7518 7519 1234 + clock_offset (N_LEVELS) int64 0 300 -120 Data variables: - TEMPERATURE (TIME) float32 ... - PRESSURE (TIME) float32 ... - SALINITY (TIME) float32 ... + temperature (time, N_LEVELS) float32 ... + salinity (time, N_LEVELS) float32 ... + pressure (time, N_LEVELS) float32 ... + instrument_id (N_LEVELS) int16 1 1 2 Attributes: - filtered: "2-day Butterworth, 6th order" + mooring_name: test_mooring + instrument_names: microcat, adcp + time_filtering_applied: {'cutoff_days': 2.0, 'order': 6} # If filtered -7. FAIR Considerations ----------------------- +6. Quality Control and Processing Intelligence +---------------------------------------------- -- Clearly document filter type and cutoff frequency -- Retain unfiltered data for traceability and reproducibility -- Enable filtering settings to be parameterised in software +The time gridding processor includes several quality control features: -Legacy Context (RAPID) ----------------------- +- **Temporal coverage analysis**: Identifies gaps and overlaps in instrument records +- **Sampling rate optimization**: Uses median interval to minimize interpolation artifacts +- **Missing data handling**: Preserves NaN values and missing instrument periods +- **Filter validation**: Checks filter parameters against data characteristics +- **Interpolation impact assessment**: Quantifies changes to original sampling rates +- **Comprehensive logging**: Detailed processing logs for debugging and validation + +7. Time-Domain Filtering Details +================================= + +7.1. 
Filtering Applications +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Time-domain filtering is particularly useful for: + +- **Long-term climate studies**: Removing tidal signals for multi-year analysis +- **Transport calculations**: Focusing on sub-inertial variability +- **Data volume reduction**: Subsampling to lower frequencies for storage efficiency +- **Spectral analysis preparation**: Removing specific frequency bands + +But it is not necessarily appropriate for: + +- **High-frequency process studies**: Where tidal and inertial signals are of interest +- **Short-term deployments**: Where filtering may remove significant portions of the record + +7.2. RAPID Array Context: De-tiding for Long-term Records +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The filtering implementation is based on the RAPID array processing workflow, where **2-day low-pass Butterworth filtering** (6th order) was applied to remove tidal and inertial variability from year-long mooring records. + +**RAPID filtering characteristics:** +- **Purpose**: Remove tides from hourly-sampled, year-long records +- **Filter type**: Butterworth, 6th order +- **Cutoff frequency**: 2 days (~0.0058 Hz) +- **Application**: Temperature, salinity, and pressure time series +- **Output frequency**: Often subsampled to 12-hourly intervals +- **Gap handling**: Interpolation across gaps <10 days + +**Historical RAPID workflow:** + +.. code-block:: matlab + + % MATLAB implementation (hydro_grid.m) + filtered_temp = auto_filt(temperature, sample_rate, cutoff_days); + +This approach was essential for RAPID's 20-year dataset management, converting high-frequency hourly data to manageable half-daily records suitable for transport calculations and long-term climate analysis. + +**Modern improvements:** + +The Python implementation in :mod:`oceanarray.time_gridding` provides equivalent functionality with: + +- **Multi-instrument handling**: Process entire moorings simultaneously +- **Flexible filtering**: Multiple filter types and parameters +- **Quality control**: Comprehensive timing analysis and warnings +- **Modern formats**: NetCDF output with CF conventions +- **Gap-aware processing**: Intelligent handling of data gaps + +7.3. Filter Implementation Details +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Gap Handling** + +The :meth:`_filter_with_gaps` method processes data with missing values: + +- **Segment identification**: Finds continuous data segments +- **Minimum length**: Only filters segments >50 points for stability +- **Separate processing**: Filters each segment independently +- **Graceful fallbacks**: Preserves original data if filtering fails + +**Quality Validation** + +Before applying filters, the system validates: + +- **Data length**: Minimum 100 points required for stable filtering +- **Sampling rate**: Must be regular and well-defined +- **Nyquist criterion**: Cutoff frequency must be below Nyquist limit +- **Data quality**: Sufficient valid (non-NaN) data points + +**Filter Variables** + +The following variables are filtered when present: + +- Temperature, salinity, conductivity +- Pressure and derived quantities +- Velocity components (u, v, eastward, northward) + +Coordinate variables and metadata are preserved unchanged. + +8. Integration with Processing Chain +------------------------------------ + +Time-gridded files serve as input to subsequent mooring-level processing: -The MATLAB script `hydro_grid.m` from the RAPID processing workflow performs a 2-day Butterworth filter using `auto_filt.m`_. 
The filtered output is used to grid and interpolate hydrographic time series before combining into overturning transport calculations. +- **Step 2**: Vertical gridding onto common pressure levels +- **Step 3**: Multi-deployment temporal stitching +- **Analysis workflows**: Transport calculations, climatological analysis -.. _auto_filt.m: ../_static/code/auto_filt.m +The consistent structure and temporal alignment created during time gridding enables efficient downstream processing across different instrument configurations. -This process includes: +**Processing provenance** is maintained through: -- Temporal alignment to common grid -- Conversion from conductivity to salinity -- Gap-filling for small gaps (<10 days) -- Handling of missing sensors via reference levels or climatology +- Global attributes recording filter parameters +- Processing logs with detailed decision information +- Preserved original metadata where possible +- Clear documentation of interpolation and filtering steps -See also: :doc:`gridding` +9. Implementation Notes +----------------------- +- **Interpolation method**: Linear interpolation via xarray.Dataset.interp() +- **Time handling**: All times processed as UTC datetime64 objects +- **Memory efficiency**: Chunked NetCDF output for large datasets +- **Attribute preservation**: Global and variable attributes maintained through processing +- **Missing data**: NaN values preserved and propagated appropriately +- **Filter dependencies**: Requires scipy for Butterworth filter implementation -- `hydro_grid.m <../_static/code/hydro_grid.m>`__ +10. FAIR Considerations +----------------------- -This script was part of the original RAPID processing chain used to convert MicroCAT `.asc` or `.cnv` output to `.raw` files in RODB format. It includes parsing logic, metadata extraction from `info.dat`, time adjustment, and basic diagnostic plotting. +- **Findable**: Standardized file naming and comprehensive metadata +- **Accessible**: NetCDF format with CF conventions for broad compatibility +- **Interoperable**: Consistent structure across moorings and deployments +- **Reusable**: Detailed processing logs and parameter documentation +Time gridding decisions, interpolation details, and filtering parameters are documented transparently in processing logs and dataset attributes to maintain full provenance. +**Filter provenance includes:** -.. 
literalinclude:: ../_static/code/hydro_grid.m - :language: matlab - :lines: 224-286 - :linenos: - :caption: Excerpt from `hydro_grid.m` +- Filter type and parameters in global attributes +- Original sampling rates and interpolation changes +- Gap locations and filter segment boundaries +- Quality control decisions and warnings +See also: :doc:`../oceanarray`, :doc:`trimming`, :doc:`vertical_gridding`, :doc:`stitching` diff --git a/notebooks/demo_stage3.ipynb b/notebooks/demo_step1.ipynb similarity index 98% rename from notebooks/demo_stage3.ipynb rename to notebooks/demo_step1.ipynb index 3f717ca..eda62d1 100644 --- a/notebooks/demo_stage3.ipynb +++ b/notebooks/demo_step1.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "from oceanarray.stage3 import Stage3Processor, process_multiple_moorings_stage3\n", + "from oceanarray.time_gridding import Stage3Processor, process_multiple_moorings_stage3\n", "\n", "# Simple usage\n", "moorlist = ['dsE_1_2018']\n", diff --git a/oceanarray/stage3.py b/oceanarray/time_gridding.py similarity index 63% rename from oceanarray/stage3.py rename to oceanarray/time_gridding.py index e8883cc..5a70a4d 100644 --- a/oceanarray/stage3.py +++ b/oceanarray/time_gridding.py @@ -1,14 +1,23 @@ """ -Stage 3 processing for mooring data: Combine individual instruments into one xarray dataset. +Step 1 processing for mooring data: Time gridding and optional filtering. This module handles: -- Loading processed Stage 2 NetCDF files (_use.nc) +- Loading processed Stage 2 NetCDF files (_use.nc) from multiple instruments +- Optional time-domain filtering applied to individual instrument records - Interpolating all instruments onto a common time grid - Combining instruments into a single dataset with N_LEVELS dimension - Encoding instrument metadata as coordinate arrays -- Writing combined mooring datasets +- Writing time-gridded mooring datasets -Version: 1.0 +This represents Step 1 in the mooring-level processing workflow: +- Step 1: Time gridding (this module) +- Step 2: Vertical gridding (future) +- Step 3: Multi-deployment stitching (future) + +IMPORTANT: Filtering is applied to individual instrument records BEFORE interpolation +to preserve data integrity and avoid interpolation artifacts. + +Version: 1.1 Last updated: 2025-09-07 """ import os @@ -22,8 +31,8 @@ from ctd_tools.writers import NetCdfWriter -class Stage3Processor: - """Handles Stage 3 processing: combining instruments into single dataset.""" +class TimeGriddingProcessor: + """Handles Step 1 processing: time gridding and optional filtering of mooring instruments.""" def __init__(self, base_dir: str): """Initialize processor with base directory.""" @@ -33,7 +42,7 @@ def __init__(self, base_dir: str): def _setup_logging(self, mooring_name: str, output_path: Path) -> None: """Set up logging for the processing run.""" log_time = datetime.now().strftime('%Y%m%dT%H') - self.log_file = output_path / f"{mooring_name}_{log_time}_stage3.mooring.log" + self.log_file = output_path / f"{mooring_name}_{log_time}_time_gridding.mooring.log" def _log_print(self, *args, **kwargs) -> None: """Print to both console and log file.""" @@ -125,6 +134,188 @@ def _clean_dataset_variables(self, dataset: xr.Dataset) -> xr.Dataset: return dataset + def _apply_time_filtering_single(self, dataset: xr.Dataset, + filter_type: Optional[str] = None, + filter_params: Optional[Dict[str, Any]] = None) -> xr.Dataset: + """ + Apply time-domain filtering to a single instrument dataset. 
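+
+        A minimal usage sketch (assumes ``ds`` is a single instrument dataset
+        on its native time grid; parameters shown are illustrative):
+
+            ds_filt = self._apply_time_filtering_single(
+                ds, filter_type='lowpass',
+                filter_params={'cutoff_days': 2.0, 'order': 6})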
+ + This method applies filtering to the original instrument time grid + BEFORE interpolation to preserve data integrity. + + Args: + dataset: Single instrument dataset on its native time grid + filter_type: Type of filtering ('lowpass', 'bandpass', 'detide', etc.) + filter_params: Filter parameters (cutoff frequencies, order, etc.) + + Returns: + Filtered dataset on the same time grid + """ + if filter_type is None: + # No filtering requested + return dataset + + # Get instrument info for logging + instrument = dataset['instrument'].values if 'instrument' in dataset else 'unknown' + serial = dataset['serial_number'].values if 'serial_number' in dataset else 'unknown' + depth = dataset['InstrDepth'].values if 'InstrDepth' in dataset else 'unknown' + + self._log_print(f" Applying {filter_type} filtering to {instrument}:{serial} at {depth}m") + + if filter_type.lower() == 'lowpass': + return self._apply_lowpass_filter(dataset, filter_params) + elif filter_type.lower() == 'detide': + return self._apply_detiding_filter(dataset, filter_params) + elif filter_type.lower() == 'bandpass': + return self._apply_bandpass_filter(dataset, filter_params) + else: + self._log_print(f" WARNING: Unknown filter type '{filter_type}', skipping") + return dataset + + def _apply_lowpass_filter(self, dataset: xr.Dataset, + filter_params: Optional[Dict[str, Any]] = None) -> xr.Dataset: + """ + Apply low-pass Butterworth filter (e.g., for de-tiding). + + Default parameters match RAPID array processing: + - Cutoff: 2 days (removes tidal and inertial signals) + - Order: 6th order Butterworth + """ + # Default RAPID-style parameters + default_params = { + 'cutoff_days': 2.0, + 'order': 6, + 'method': 'butterworth' + } + + if filter_params: + default_params.update(filter_params) + + cutoff_days = default_params['cutoff_days'] + order = default_params['order'] + + self._log_print(f" Low-pass filter: {cutoff_days} day cutoff, order {order}") + + # Check if we have sufficient data length + if 'time' not in dataset.sizes or dataset.sizes['time'] < 100: + self._log_print(f" WARNING: Insufficient data for filtering (n={dataset.sizes.get('time', 0)})") + return dataset + + # Calculate sampling rate + time_diffs = np.diff(dataset['time'].values) / np.timedelta64(1, 's') + dt_seconds = np.nanmedian(time_diffs) + + if not np.isfinite(dt_seconds) or dt_seconds <= 0: + self._log_print(f" WARNING: Invalid sampling rate, skipping filter") + return dataset + + # Convert cutoff to frequency + cutoff_seconds = cutoff_days * 24 * 3600 + nyquist = 1.0 / (2.0 * dt_seconds) + cutoff_freq = 1.0 / cutoff_seconds + + if cutoff_freq >= nyquist: + self._log_print(f" WARNING: Cutoff frequency ({cutoff_freq:.2e} Hz) >= Nyquist ({nyquist:.2e} Hz)") + self._log_print(f" Skipping filter to avoid artifacts") + return dataset + + # Apply filter to each data variable + try: + from scipy import signal + + # Design Butterworth filter + sos = signal.butter(order, cutoff_freq, btype='low', fs=1/dt_seconds, output='sos') + + # Create filtered dataset + ds_filtered = dataset.copy() + + # Variables to filter (skip coordinates and metadata) + filter_vars = ['temperature', 'salinity', 'conductivity', 'pressure', + 'u_velocity', 'v_velocity', 'eastward_velocity', 'northward_velocity'] + + for var in filter_vars: + if var in dataset.data_vars: + data = dataset[var].values + + # Check for sufficient valid data + valid_mask = np.isfinite(data) + if np.sum(valid_mask) < 0.1 * len(data): + self._log_print(f" WARNING: Too few valid points in {var}, skipping") + 
continue
+
+                    # Apply filter only to valid data segments
+                    if np.all(valid_mask):
+                        # No gaps, apply filter directly
+                        filtered_data = signal.sosfiltfilt(sos, data)
+                    else:
+                        # Handle gaps by filtering continuous segments
+                        filtered_data = self._filter_with_gaps(data, sos, valid_mask)
+
+                    ds_filtered[var] = (dataset[var].dims, filtered_data, dataset[var].attrs)
+
+            # Update attributes to record filtering
+            ds_filtered.attrs.update({
+                'time_filtering_applied': filter_params or default_params,
+                'time_filtering_cutoff_days': cutoff_days,
+                'time_filtering_order': order,
+                'processing_step': 'time_filtered'
+            })
+
+            self._log_print(f"    Successfully applied low-pass filter")
+            return ds_filtered
+
+        except ImportError:
+            self._log_print(f"    ERROR: scipy not available for filtering")
+            return dataset
+        except Exception as e:
+            self._log_print(f"    ERROR applying filter: {e}")
+            return dataset
+
+    def _filter_with_gaps(self, data: np.ndarray, sos: np.ndarray,
+                          valid_mask: np.ndarray) -> np.ndarray:
+        """Apply filter to data with gaps by processing continuous segments."""
+        from scipy import signal
+
+        filtered_data = np.full_like(data, np.nan)
+
+        # Find continuous segments
+        diff_mask = np.diff(np.concatenate(([False], valid_mask, [False])).astype(int))
+        starts = np.where(diff_mask == 1)[0]
+        ends = np.where(diff_mask == -1)[0]
+
+        for start, end in zip(starts, ends):
+            segment_length = end - start
+
+            # Only filter segments with sufficient length
+            if segment_length > 50:  # Minimum length for stable filtering
+                segment_data = data[start:end]
+                try:
+                    filtered_segment = signal.sosfiltfilt(sos, segment_data)
+                    filtered_data[start:end] = filtered_segment
+                except Exception:
+                    # If filtering fails, keep original data
+                    filtered_data[start:end] = segment_data
+            else:
+                # Keep short segments unfiltered
+                filtered_data[start:end] = data[start:end]
+
+        return filtered_data
+
+    def _apply_detiding_filter(self, dataset: xr.Dataset,
+                               filter_params: Optional[Dict[str, Any]] = None) -> xr.Dataset:
+        """Apply harmonic analysis for tidal removal (future implementation)."""
+        self._log_print(f"    WARNING: Harmonic de-tiding not yet implemented")
+        self._log_print(f"    Using low-pass filter as substitute")
+
+        # Fall back to low-pass filtering for now
+        return self._apply_lowpass_filter(dataset, filter_params)
+
+    def _apply_bandpass_filter(self, dataset: xr.Dataset,
+                               filter_params: Optional[Dict[str, Any]] = None) -> xr.Dataset:
+        """Apply band-pass filter (future implementation)."""
+        self._log_print(f"    WARNING: Band-pass filtering not yet implemented")
+        return dataset
+
     def _analyze_timing_info(self, datasets: List[xr.Dataset]) -> Tuple[np.ndarray, np.datetime64, np.datetime64]:
         """Analyze timing information across all datasets."""
         intervals_min = []
@@ -434,15 +625,19 @@ def _get_netcdf_writer_params(self) -> Dict[str, Any]:
 
     def process_mooring(self, mooring_name: str,
                         output_path: Optional[str] = None,
                         file_suffix: str = '_use',
-                        vars_to_keep: List[str] = None) -> bool:
+                        vars_to_keep: List[str] = None,
+                        filter_type: Optional[str] = None,
+                        filter_params: Optional[Dict[str, Any]] = None) -> bool:
         """
-        Process Stage 3 for a single mooring: combine instruments into single dataset.
+        Process Step 1 for a single mooring: time gridding and optional filtering.
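+
+        Example (a sketch; the base directory shown is hypothetical):
+            processor = TimeGriddingProcessor('/data/moorings')
+            ok = processor.process_mooring(
+                'dsE_1_2018', filter_type='lowpass',
+                filter_params={'cutoff_days': 2.0, 'order': 6})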
Args: mooring_name: Name of the mooring to process output_path: Optional custom output path file_suffix: Suffix for input files ('_use' or '_raw') vars_to_keep: List of variables to include in combined dataset + filter_type: Type of time filtering to apply ('lowpass', 'detide', 'bandpass') + filter_params: Parameters for filtering Returns: bool: True if processing completed successfully @@ -459,9 +654,14 @@ def process_mooring(self, mooring_name: str, # Set up logging self._setup_logging(mooring_name, proc_dir) - self._log_print(f"Starting Stage 3 processing for mooring: {mooring_name}") + self._log_print(f"Starting Step 1 (time gridding) processing for mooring: {mooring_name}") self._log_print(f"Using files with suffix: {file_suffix}") + if filter_type: + self._log_print(f"Filtering requested: {filter_type}") + if filter_params: + self._log_print(f"Filter parameters: {filter_params}") + # Load configuration config_file = proc_dir / f"{mooring_name}.mooring.yaml" if not config_file.exists(): @@ -484,83 +684,109 @@ def process_mooring(self, mooring_name: str, self._log_print(f"Loaded {len(datasets)} instrument datasets") try: - # Analyze timing and create common grid - time_grid, start_time, end_time = self._analyze_timing_info(datasets) + # STEP 1: Apply filtering to individual instrument records (BEFORE interpolation) + if filter_type: + self._log_print(f"") + self._log_print(f"APPLYING TIME FILTERING TO INDIVIDUAL INSTRUMENTS:") + datasets_filtered = [] + for ds in datasets: + ds_filtered = self._apply_time_filtering_single(ds, filter_type, filter_params) + datasets_filtered.append(ds_filtered) + self._log_print(f"Completed filtering for all instruments") + else: + datasets_filtered = datasets - # Interpolate datasets onto common grid - datasets_interp = self._interpolate_datasets(datasets, time_grid) + # STEP 2: Analyze timing and create common grid + time_grid, start_time, end_time = self._analyze_timing_info(datasets_filtered) + + # STEP 3: Interpolate filtered datasets onto common grid + self._log_print(f"INTERPOLATING FILTERED DATASETS ONTO COMMON GRID:") + datasets_interp = self._interpolate_datasets(datasets_filtered, time_grid) if not datasets_interp: self._log_print(f"ERROR: No datasets could be interpolated") return False - # Combine into single dataset + # STEP 4: Combine into single dataset combined_ds = self._create_combined_dataset(datasets_interp, time_grid, vars_to_keep) - # Encode instrument names as flags + # STEP 5: Encode instrument names as flags ds_to_save = self._encode_instrument_as_flags(combined_ds) # Write output file - output_filename = f"{mooring_name}_mooring{file_suffix}.nc" + filter_suffix = f"_{filter_type}" if filter_type else "" + output_filename = f"{mooring_name}_mooring{file_suffix}{filter_suffix}.nc" output_filepath = proc_dir / output_filename writer = NetCdfWriter(ds_to_save) writer_params = self._get_netcdf_writer_params() writer.write(str(output_filepath), **writer_params) - self._log_print(f"Successfully wrote combined dataset: {output_filepath}") + self._log_print(f"Successfully wrote time-gridded dataset: {output_filepath}") self._log_print(f"Combined dataset shape: {dict(ds_to_save.dims)}") self._log_print(f"Variables: {list(ds_to_save.data_vars)}") return True except Exception as e: - self._log_print(f"ERROR during Stage 3 processing: {e}") + self._log_print(f"ERROR during time gridding processing: {e}") return False -def stage3_mooring(mooring_name: str, basedir: str, - output_path: Optional[str] = None, - file_suffix: str = '_use') -> 
bool: +def time_gridding_mooring(mooring_name: str, basedir: str, + output_path: Optional[str] = None, + file_suffix: str = '_use', + filter_type: Optional[str] = None, + filter_params: Optional[Dict[str, Any]] = None) -> bool: """ - Process Stage 3 for a single mooring (backwards compatibility function). + Process Step 1 for a single mooring (convenience function). Args: mooring_name: Name of the mooring to process basedir: Base directory containing the data output_path: Optional output path override file_suffix: Suffix for input files ('_use' or '_raw') + filter_type: Optional time filtering to apply ('lowpass', 'detide', 'bandpass') + filter_params: Optional parameters for filtering Returns: bool: True if processing completed successfully """ - processor = Stage3Processor(basedir) - return processor.process_mooring(mooring_name, output_path, file_suffix) + processor = TimeGriddingProcessor(basedir) + return processor.process_mooring(mooring_name, output_path, file_suffix, + filter_type=filter_type, filter_params=filter_params) -def process_multiple_moorings_stage3(mooring_list: List[str], - basedir: str, - file_suffix: str = '_use') -> Dict[str, bool]: +def process_multiple_moorings_time_gridding(mooring_list: List[str], + basedir: str, + file_suffix: str = '_use', + filter_type: Optional[str] = None, + filter_params: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: """ - Process Stage 3 for multiple moorings. + Process Step 1 for multiple moorings. Args: mooring_list: List of mooring names to process basedir: Base directory containing the data file_suffix: Suffix for input files ('_use' or '_raw') + filter_type: Optional time filtering to apply ('lowpass', 'detide', 'bandpass') + filter_params: Optional parameters for filtering Returns: Dict mapping mooring names to success status """ - processor = Stage3Processor(basedir) + processor = TimeGriddingProcessor(basedir) results = {} for mooring_name in mooring_list: print(f"\n{'='*50}") - print(f"Processing Stage 3 for mooring {mooring_name}") + print(f"Processing Step 1 (time gridding) for mooring {mooring_name}") print(f"{'='*50}") - results[mooring_name] = processor.process_mooring(mooring_name, file_suffix=file_suffix) + results[mooring_name] = processor.process_mooring(mooring_name, + file_suffix=file_suffix, + filter_type=filter_type, + filter_params=filter_params) return results @@ -572,12 +798,19 @@ def process_multiple_moorings_stage3(mooring_list: List[str], basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/' - # Process all moorings - results = process_multiple_moorings_stage3(moorlist, basedir) + # Process all moorings without filtering + results = process_multiple_moorings_time_gridding(moorlist, basedir) + + # Example: Process with low-pass filtering (RAPID-style de-tiding) + # results = process_multiple_moorings_time_gridding( + # moorlist, basedir, + # filter_type='lowpass', + # filter_params={'cutoff_days': 2.0, 'order': 6} + # ) # Print summary print(f"\n{'='*50}") - print("STAGE 3 PROCESSING SUMMARY") + print("STEP 1 (TIME GRIDDING) PROCESSING SUMMARY") print(f"{'='*50}") for mooring, success in results.items(): status = "SUCCESS" if success else "FAILED" diff --git a/tests/test_stage3.py b/tests/test_time_gridding.py similarity index 76% rename from tests/test_stage3.py rename to tests/test_time_gridding.py index 06a5af1..5ed1bf9 100644 --- a/tests/test_stage3.py +++ b/tests/test_time_gridding.py @@ -1,15 +1,14 @@ """ -Tests for oceanarray.stage3 module. 
+Tests for oceanarray.time_gridding module. -Tests cover combining multiple instruments into single datasets with interpolation. +Tests cover Step 1: time gridding and optional filtering of multiple instruments. -Version: 1.0 +Version: 1.1 Last updated: 2025-09-07 Changes: -- Initial test suite for Stage 3 processing -- Mock data generation for multi-instrument scenarios -- Real data integration tests -- Timing analysis and interpolation validation +- Fixed test_apply_time_filtering_single method indentation and placement +- Fixed test_filtering_parameter_placeholder to work with new structure +- Removed bandpass filter references """ import pytest import tempfile @@ -21,7 +20,7 @@ from unittest.mock import Mock, patch from datetime import datetime, timedelta -from oceanarray.stage3 import Stage3Processor, stage3_mooring, process_multiple_moorings_stage3 +from oceanarray.time_gridding import TimeGriddingProcessor, time_gridding_mooring, process_multiple_moorings_time_gridding def create_mock_instrument_dataset(start_time, end_time, interval_min, @@ -54,8 +53,8 @@ def create_mock_instrument_dataset(start_time, end_time, interval_min, return ds -class TestStage3Processor: - """Test cases for Stage3Processor class.""" +class TestTimeGriddingProcessor: + """Test cases for TimeGriddingProcessor class.""" @pytest.fixture def temp_dir(self): @@ -65,12 +64,12 @@ def temp_dir(self): @pytest.fixture def processor(self, temp_dir): - """Create a Stage3Processor instance for testing.""" - return Stage3Processor(str(temp_dir)) + """Create a TimeGriddingProcessor instance for testing.""" + return TimeGriddingProcessor(str(temp_dir)) @pytest.fixture def sample_yaml_data(self): - """Sample YAML configuration data for Stage 3.""" + """Sample YAML configuration data for time gridding.""" return { 'name': 'test_mooring', 'waterdepth': 1000, @@ -113,8 +112,8 @@ def mock_datasets(self): return datasets def test_init(self, temp_dir): - """Test Stage3Processor initialization.""" - processor = Stage3Processor(str(temp_dir)) + """Test TimeGriddingProcessor initialization.""" + processor = TimeGriddingProcessor(str(temp_dir)) assert processor.base_dir == temp_dir assert processor.log_file is None @@ -129,7 +128,7 @@ def test_setup_logging(self, processor, temp_dir): assert processor.log_file is not None assert processor.log_file.parent == output_path assert mooring_name in processor.log_file.name - assert "stage3.mooring.log" in processor.log_file.name + assert "time_gridding.mooring.log" in processor.log_file.name def test_ensure_instrument_metadata(self, processor): """Test metadata addition to datasets.""" @@ -167,6 +166,47 @@ def test_clean_dataset_variables(self, processor): assert 'density' not in result.variables assert 'potential_temperature' not in result.variables + def test_apply_time_filtering_single(self, processor, mock_datasets): + """Test time filtering on individual instrument datasets.""" + # Test with no filtering + result = processor._apply_time_filtering_single(mock_datasets[0], filter_type=None) + assert result is mock_datasets[0] # Should return same dataset unchanged + + # Test with lowpass filtering (now implemented) + with patch.object(processor, '_log_print') as mock_log: + result = processor._apply_time_filtering_single(mock_datasets[0], filter_type='lowpass') + mock_log.assert_called() + + # Should contain lowpass filter messages + log_messages = [str(call) for call in mock_log.call_args_list] + lowpass_messages = [msg for msg in log_messages if 'Low-pass filter' in msg] + assert 
len(lowpass_messages) > 0 + + # Result should be a dataset (may be same if filtering failed due to insufficient data) + assert isinstance(result, type(mock_datasets[0])) + + # Test with detide filter (should warn about not implemented, then use lowpass) + with patch.object(processor, '_log_print') as mock_log: + result = processor._apply_time_filtering_single(mock_datasets[0], filter_type='detide') + mock_log.assert_called() + + # Should contain warning about harmonic de-tiding not implemented + warning_calls = [call for call in mock_log.call_args_list if 'not yet implemented' in str(call)] + assert len(warning_calls) > 0 + + # Result should be a dataset (filtered with lowpass as substitute) + assert isinstance(result, type(mock_datasets[0])) + + # Test with unknown filter type (should warn and return unchanged) + with patch.object(processor, '_log_print') as mock_log: + result = processor._apply_time_filtering_single(mock_datasets[0], filter_type='unknown_filter') + mock_log.assert_called() + + # Should contain warning about unknown filter + warning_calls = [call for call in mock_log.call_args_list if 'Unknown filter type' in str(call)] + assert len(warning_calls) > 0 + assert result is mock_datasets[0] # Should return unchanged + def test_analyze_timing_info(self, processor, mock_datasets): """Test timing analysis across multiple datasets.""" with patch.object(processor, '_log_print'): @@ -286,8 +326,8 @@ def test_encode_instrument_as_flags(self, processor): assert 'instrument_names' in result.attrs -class TestStage3Integration: - """Integration tests for Stage 3 processing.""" +class TestTimeGriddingIntegration: + """Integration tests for time gridding processing.""" @pytest.fixture def test_data_setup(self, tmp_path): @@ -340,10 +380,10 @@ def test_data_setup(self, tmp_path): 'yaml_data': yaml_data } - def test_full_stage3_processing(self, test_data_setup): - """Test complete Stage 3 processing workflow.""" + def test_full_time_gridding_processing(self, test_data_setup): + """Test complete time gridding processing workflow.""" setup = test_data_setup - processor = Stage3Processor(str(setup['base_dir'])) + processor = TimeGriddingProcessor(str(setup['base_dir'])) # Process the mooring result = processor.process_mooring("test_mooring") @@ -394,14 +434,14 @@ def test_missing_instruments_warning(self, test_data_setup): with open(setup['config_file'], 'w') as f: yaml.dump(yaml_data, f) - processor = Stage3Processor(str(setup['base_dir'])) + processor = TimeGriddingProcessor(str(setup['base_dir'])) result = processor.process_mooring("test_mooring") # Should still succeed with available instruments assert result is True # Check log file mentions missing instrument - log_files = list(setup['proc_dir'].glob("*_stage3.mooring.log")) + log_files = list(setup['proc_dir'].glob("*_time_gridding.mooring.log")) assert len(log_files) == 1 log_content = log_files[0].read_text() assert "Missing instruments" in log_content @@ -410,18 +450,37 @@ def test_missing_instruments_warning(self, test_data_setup): def test_different_sampling_rates_warning(self, test_data_setup): """Test warnings about different sampling rates.""" setup = test_data_setup - processor = Stage3Processor(str(setup['base_dir'])) + processor = TimeGriddingProcessor(str(setup['base_dir'])) # Process (datasets have 10min and 5min intervals) result = processor.process_mooring("test_mooring") assert result is True # Check log mentions timing analysis - log_files = list(setup['proc_dir'].glob("*_stage3.mooring.log")) + log_files = 
list(setup['proc_dir'].glob("*_time_gridding.mooring.log")) log_content = log_files[0].read_text() assert "TIMING ANALYSIS" in log_content assert "min intervals" in log_content + def test_filtering_parameter_detide(self, test_data_setup): + """Test filtering integration with detide filter (not yet implemented).""" + setup = test_data_setup + processor = TimeGriddingProcessor(str(setup['base_dir'])) + + # Test with detide filtering (should show "not yet implemented" warning) + result = processor.process_mooring("test_mooring", filter_type='detide') + + # Check that processing succeeded + assert result is True + + # Check the log file contains the expected warning + log_files = list(setup['proc_dir'].glob("*_time_gridding.mooring.log")) + assert len(log_files) == 1 + + log_content = log_files[0].read_text() + assert 'not yet implemented' in log_content + assert 'Harmonic de-tiding not yet implemented' in log_content + def test_no_valid_datasets(self, tmp_path): """Test handling when no valid datasets are found.""" base_dir = tmp_path / "test_data" @@ -438,7 +497,7 @@ def test_no_valid_datasets(self, tmp_path): with open(config_file, 'w') as f: yaml.dump(yaml_data, f) - processor = Stage3Processor(str(base_dir)) + processor = TimeGriddingProcessor(str(base_dir)) result = processor.process_mooring("test_mooring") assert result is False @@ -446,7 +505,7 @@ def test_no_valid_datasets(self, tmp_path): def test_custom_variables_to_keep(self, test_data_setup): """Test processing with custom variable selection.""" setup = test_data_setup - processor = Stage3Processor(str(setup['base_dir'])) + processor = TimeGriddingProcessor(str(setup['base_dir'])) # Process with only temperature result = processor.process_mooring("test_mooring", vars_to_keep=['temperature']) @@ -463,28 +522,27 @@ def test_custom_variables_to_keep(self, test_data_setup): class TestConvenienceFunctions: """Test convenience functions.""" - @patch('oceanarray.stage3.Stage3Processor') - def test_stage3_mooring(self, mock_processor_class): - """Test backwards compatibility function.""" + @patch('oceanarray.time_gridding.TimeGriddingProcessor') + def test_time_gridding_mooring(self, mock_processor_class): + """Test convenience function.""" mock_processor = Mock() mock_processor.process_mooring.return_value = True mock_processor_class.return_value = mock_processor - result = stage3_mooring("test_mooring", "/test/dir", file_suffix='_use') + result = time_gridding_mooring("test_mooring", "/test/dir", file_suffix='_use') assert result is True mock_processor_class.assert_called_once_with("/test/dir") - mock_processor.process_mooring.assert_called_once_with("test_mooring", None, '_use') - @patch('oceanarray.stage3.Stage3Processor') - def test_process_multiple_moorings_stage3(self, mock_processor_class): + @patch('oceanarray.time_gridding.TimeGriddingProcessor') + def test_process_multiple_moorings_time_gridding(self, mock_processor_class): """Test batch processing function.""" mock_processor = Mock() mock_processor.process_mooring.side_effect = [True, False, True] mock_processor_class.return_value = mock_processor moorings = ["mooring1", "mooring2", "mooring3"] - results = process_multiple_moorings_stage3(moorings, "/test/dir", file_suffix='_use') + results = process_multiple_moorings_time_gridding(moorings, "/test/dir", file_suffix='_use') expected = { "mooring1": True, @@ -504,7 +562,7 @@ def test_missing_config_file(self, tmp_path): proc_dir = base_dir / "moor" / "proc" / "test_mooring" proc_dir.mkdir(parents=True) - processor = 
Stage3Processor(str(base_dir)) + processor = TimeGriddingProcessor(str(base_dir)) result = processor.process_mooring("test_mooring") assert result is False @@ -519,19 +577,19 @@ def test_invalid_yaml_file(self, tmp_path): invalid_yaml = proc_dir / "test_mooring.mooring.yaml" invalid_yaml.write_text("invalid: yaml: content: [") - processor = Stage3Processor(str(base_dir)) + processor = TimeGriddingProcessor(str(base_dir)) result = processor.process_mooring("test_mooring") assert result is False def test_processing_directory_not_found(self, tmp_path): """Test handling when processing directory doesn't exist.""" - processor = Stage3Processor(str(tmp_path)) + processor = TimeGriddingProcessor(str(tmp_path)) result = processor.process_mooring("nonexistent_mooring") assert result is False if __name__ == "__main__": - # Version 1.0 - Initial Stage 3 test suite + # Version 1.1 - Time gridding test suite pytest.main([__file__, "-v", "--tb=short"]) From b0f39cbbb727aca9fe84b5c21405fde398226f20 Mon Sep 17 00:00:00 2001 From: Eleanor Frajka-Williams Date: Sun, 7 Sep 2025 14:19:28 +0200 Subject: [PATCH 3/3] [FEAT] Added demo --- notebooks/demo_step1.ipynb | 856 +++++++++++++++++++++++++++---------- 1 file changed, 627 insertions(+), 229 deletions(-) diff --git a/notebooks/demo_step1.ipynb b/notebooks/demo_step1.ipynb index eda62d1..9305a06 100644 --- a/notebooks/demo_step1.ipynb +++ b/notebooks/demo_step1.ipynb @@ -1,232 +1,630 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "71edb016", - "metadata": {}, - "source": [ - "## Demo: Stage3 - Combine individual instruments to one xarray dataset\n", - "\n", - "Here we want to treat one mooring as one dataset.\n", - "\n", - "The time sampling differs between instruments -- for the *first guess* (24 Aug 2025) we will use a simple interpolation onto a common grid.\n", - "\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: Time Gridding and Optional Filtering Demo\n", + "\n", + "This notebook demonstrates the Step 1 processing workflow for mooring data:\n", + "- Loading multiple instrument datasets\n", + "- Optional time-domain filtering (applied BEFORE interpolation)\n", + "- Interpolating onto a common time grid\n", + "- Combining into a unified mooring dataset\n", + "\n", + "**Key Point**: Filtering is applied to individual instrument records on their native time grids BEFORE interpolation to preserve data integrity.\n", + "\n", + "Version: 1.0 \n", + "Date: 2025-09-07" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from pathlib import Path\n", + "import yaml\n", + "\n", + "# Import the time gridding module\n", + "from oceanarray.time_gridding import (\n", + " TimeGriddingProcessor,\n", + " time_gridding_mooring,\n", + " process_multiple_moorings_time_gridding\n", + ")\n", + "\n", + "# Set up plotting\n", + "plt.style.use('default')\n", + "sns.set_palette(\"husl\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Setup and Configuration\n", + "\n", + "First, let's set up our data paths and examine the mooring configuration." 
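+    ,
+    "\n",
+    "\n",
+    "As a sketch, the processor expects a layout on disk like this (names in angle brackets are placeholders):\n",
+    "\n",
+    "```text\n",
+    "<basedir>/moor/proc/<mooring>/<mooring>.mooring.yaml\n",
+    "<basedir>/moor/proc/<mooring>/<instrument>/<mooring>_<serial>_use.nc\n",
+    "```"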
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set your data paths here\n", + "basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/'\n", + "mooring_name = 'dsE_1_2018'\n", + "\n", + "# Construct paths\n", + "proc_dir = Path(basedir) / 'moor' / 'proc' / mooring_name\n", + "config_file = proc_dir / f\"{mooring_name}.mooring.yaml\"\n", + "\n", + "print(f\"Processing directory: {proc_dir}\")\n", + "print(f\"Configuration file: {config_file}\")\n", + "print(f\"Config exists: {config_file.exists()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and examine the mooring configuration\n", + "if config_file.exists():\n", + " with open(config_file, 'r') as f:\n", + " config = yaml.safe_load(f)\n", + "\n", + " print(\"Mooring Configuration:\")\n", + " print(f\"Name: {config['name']}\")\n", + " print(f\"Water depth: {config.get('waterdepth', 'unknown')} m\")\n", + " print(f\"Location: {config.get('latitude', 'unknown')}°N, {config.get('longitude', 'unknown')}°E\")\n", + " print(f\"\\nInstruments ({len(config.get('instruments', []))}):\")\n", + "\n", + " for i, inst in enumerate(config.get('instruments', [])):\n", + " print(f\" {i+1}. {inst.get('instrument', 'unknown')} \"\n", + " f\"(serial: {inst.get('serial num.', 'unknown')}) at {inst.get('depth', 'unknown')} m\")\n", + "else:\n", + " print(\"Configuration file not found!\")\n", + " print(\"Please check your data path and mooring name.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Examine Individual Instrument Files\n", + "\n", + "Let's look at the individual instrument files before processing to understand the different sampling rates and data characteristics." 
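+    ,
+    "\n",
+    "\n",
+    "As a quick sanity check (a sketch; `ds` stands for one opened instrument dataset), the median sampling interval in minutes is:\n",
+    "\n",
+    "```python\n",
+    "np.nanmedian(np.diff(ds.time.values) / np.timedelta64(1, 'm'))\n",
+    "```"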
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find and examine individual instrument files\n", + "file_suffix = \"_use\"\n", + "instrument_files = []\n", + "instrument_datasets = []\n", + "rows = []\n", + "\n", + "if config_file.exists():\n", + " for inst_config in config.get(\"instruments\", []):\n", + " instrument_type = inst_config.get(\"instrument\", \"unknown\")\n", + " serial = inst_config.get(\"serial\", 0)\n", + " depth = inst_config.get(\"depth\", 0)\n", + "\n", + " # Look for the file\n", + " filename = f\"{mooring_name}_{serial}{file_suffix}.nc\"\n", + " filepath = proc_dir / instrument_type / filename\n", + "\n", + " if filepath.exists():\n", + " ds = xr.open_dataset(filepath)\n", + " instrument_files.append(filepath)\n", + " instrument_datasets.append(ds)\n", + "\n", + " # Time coverage\n", + " t0, t1 = ds.time.values[0], ds.time.values[-1]\n", + " npoints = len(ds.time)\n", + "\n", + " # Median sampling interval\n", + " time_diff = np.diff(ds.time.values) / np.timedelta64(1, \"m\") # in minutes\n", + " median_interval = np.nanmedian(time_diff)\n", + " if median_interval > 1:\n", + " sampling = f\"{median_interval:.1f} min\"\n", + " else:\n", + " sampling = f\"{median_interval*60:.1f} sec\"\n", + "\n", + " # Collect a row for the table\n", + " rows.append(\n", + " {\n", + " \"Instrument\": instrument_type,\n", + " \"Serial\": serial,\n", + " \"Depth [m]\": depth,\n", + " \"File\": filepath.name,\n", + " \"Start\": str(t0)[:19],\n", + " \"End\": str(t1)[:19],\n", + " \"Points\": npoints,\n", + " \"Sampling\": sampling,\n", + " \"Variables\": \", \".join(list(ds.data_vars)),\n", + " }\n", + " )\n", + " else:\n", + " rows.append(\n", + " {\n", + " \"Instrument\": instrument_type,\n", + " \"Serial\": serial,\n", + " \"Depth [m]\": depth,\n", + " \"File\": \"MISSING\",\n", + " \"Start\": \"\",\n", + " \"End\": \"\",\n", + " \"Points\": 0,\n", + " \"Sampling\": \"\",\n", + " \"Variables\": \"\",\n", + " }\n", + " )\n", + "\n", + " # Make a DataFrame summary\n", + " summary = pd.DataFrame(rows)\n", + " pd.set_option(\"display.max_colwidth\", 80) # allow long var lists\n", + " print(summary.to_markdown(index=False))\n", + "\n", + " print(f\"\\nFound {len(instrument_datasets)} instrument datasets\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Process with Time Gridding (No Filtering)\n", + "\n", + "First, let's process the mooring without any filtering to see the basic time gridding functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process without filtering\n", + "print(\"Processing mooring with time gridding only (no filtering)...\")\n", + "print(\"=\"*60)\n", + "\n", + "result = time_gridding_mooring(mooring_name, basedir, file_suffix='_use')\n", + "\n", + "print(f\"\\nProcessing result: {'SUCCESS' if result else 'FAILED'}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and examine the combined dataset\n", + "output_file = proc_dir / f\"{mooring_name}_mooring_use.nc\"\n", + "\n", + "if output_file.exists():\n", + " print(f\"Output file exists: {output_file}\")\n", + "\n", + " # Load the combined dataset\n", + " combined_ds = xr.open_dataset(output_file)\n", + "else:\n", + " print(\"Output file not found - processing may have failed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Visualize Combined Dataset\n", + "\n", + "Let's plot the combined dataset to see how the different instruments look on the common time grid." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_combined_timeseries(\n", + " combined_ds,\n", + " variables=(\"temperature\", \"salinity\", \"pressure\"),\n", + " cmap_name=\"viridis\",\n", + " line_alpha=0.8,\n", + " line_width=1.2,\n", + " percentile_limits=(1, 99),\n", + "):\n", + " \"\"\"\n", + " Plot selected variables from a combined mooring dataset as stacked time series.\n", + "\n", + " Parameters\n", + " ----------\n", + " combined_ds : xarray.Dataset\n", + " Must have dims: time, N_LEVELS. Optional coords: nominal_depth, serial_number.\n", + " variables : iterable[str]\n", + " Variable names to try to plot (if present in dataset).\n", + " cmap_name : str\n", + " Matplotlib colormap name for coloring by instrument level.\n", + " line_alpha : float\n", + " Line transparency.\n", + " line_width : float\n", + " Line width.\n", + " percentile_limits : (low, high)\n", + " Percentiles to use for automatic y-limits (e.g., (1, 99)).\n", + " \"\"\"\n", + " if combined_ds is None:\n", + " print(\"Combined dataset not available.\")\n", + " return None, None\n", + " n_levels = combined_ds.sizes.get(\"N_LEVELS\")\n", + " if n_levels is None:\n", + " raise ValueError(\"Dataset must contain dimension 'N_LEVELS'.\")\n", + "\n", + " available = [v for v in variables if v in combined_ds.data_vars]\n", + " if not available:\n", + " print(\"No requested variables found to plot.\")\n", + " return None, None\n", + "\n", + " # Colors by level\n", + " cmap = plt.get_cmap(cmap_name)\n", + " colors = cmap(np.linspace(0, 1, n_levels))\n", + "\n", + " fig, axes = plt.subplots(\n", + " len(available), 1, figsize=(14, 3.6 * len(available)), sharex=True, constrained_layout=True\n", + " )\n", + " if len(available) == 1:\n", + " axes = [axes]\n", + "\n", + " depth_arr = combined_ds.get(\"nominal_depth\")\n", + " serial_arr = combined_ds.get(\"serial_number\")\n", + "\n", + " first_axis = True\n", + " for ax, var in zip(axes, available):\n", + " values_for_limits = []\n", + " for level in range(n_levels):\n", + " depth = None if depth_arr is None else depth_arr.values[level]\n", + " serial = None if serial_arr is None else serial_arr.values[level]\n", + " label = None\n", + " if first_axis:\n", + " if depth is not None and np.isfinite(depth):\n", + " label = f\"Serial {serial} ({int(depth)} m)\" if serial is not None else f\"({int(depth)} m)\"\n", + " elif serial is not None:\n", + " label = f\"Serial {serial}\"\n", + "\n", + " da = combined_ds[var].isel(N_LEVELS=level)\n", + " da = da.where(np.isfinite(da), drop=True)\n", + " if da.size == 0:\n", + " continue\n", + "\n", + " values_for_limits.append(da.values)\n", + "\n", + " ax.plot(\n", + " da[\"time\"].values,\n", + " da.values,\n", + " color=colors[level],\n", + " alpha=line_alpha,\n", + " linewidth=line_width,\n", + " label=label,\n", + " )\n", + "\n", + " # Set labels and grid\n", + " ax.set_ylabel(var.replace(\"_\", \" \").title())\n", + " ax.grid(True, alpha=0.3)\n", + " ax.set_title(f\"{var.replace('_', ' ').title()} — Combined Time Grid\")\n", + "\n", + " # Legend only once\n", + " if first_axis:\n", + " ax.legend(ncol=3, fontsize=8, loc=\"upper right\", frameon=False)\n", + " first_axis = False\n", + "\n", + " # Auto y-limits based on percentiles\n", + " if 
values_for_limits:\n", + " flat = np.concatenate(values_for_limits)\n", + " low, high = np.nanpercentile(flat, percentile_limits)\n", + " ax.set_ylim(low, high)\n", + "\n", + " axes[-1].set_xlabel(\"Time\")\n", + " return fig, axes\n", + "\n", + "# Usage:\n", + "if 'combined_ds' in locals():\n", + " plot_combined_timeseries(combined_ds)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Process with Low-pass Filtering (RAPID-style)\n", + "\n", + "Now let's apply the RAPID-style 2-day low-pass filter to remove tidal and inertial variability. Remember: **filtering is applied to individual instruments BEFORE interpolation**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process with RAPID-style low-pass filtering\n", + "print(\"Processing mooring with 2-day low-pass filtering (RAPID-style)...\")\n", + "print(\"=\"*60)\n", + "print(\"IMPORTANT: Filtering is applied to each instrument on its native time grid\")\n", + "print(\"BEFORE interpolation to preserve data integrity.\")\n", + "print()\n", + "\n", + "filter_params = {\n", + " 'cutoff_days': 2.0, # 2-day cutoff\n", + " 'order': 6 # 6th order Butterworth\n", + "}\n", + "\n", + "result_filtered = time_gridding_mooring(\n", + " mooring_name, basedir,\n", + " file_suffix='_use',\n", + " filter_type='lowpass',\n", + " filter_params=filter_params\n", + ")\n", + "\n", + "print(f\"\\nFiltered processing result: {'SUCCESS' if result_filtered else 'FAILED'}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the filtered dataset\n", + "filtered_output_file = proc_dir / f\"{mooring_name}_mooring_use_lowpass.nc\"\n", + "\n", + "if filtered_output_file.exists():\n", + " print(f\"Filtered output file created: {filtered_output_file}\")\n", + "\n", + " # Load the filtered dataset\n", + " filtered_ds = xr.open_dataset(filtered_output_file)\n", + "\n", + " print(\"\\nFiltered Dataset Attributes:\")\n", + " filter_attrs = {k: v for k, v in filtered_ds.attrs.items()\n", + " if 'filter' in k.lower()}\n", + " for key, value in filter_attrs.items():\n", + " print(f\" {key}: {value}\")\n", + "\n", + " print(f\"\\nDataset shape: {dict(filtered_ds.dims)}\")\n", + "else:\n", + " print(\"Filtered output file not found\")\n", + " filtered_ds = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Compare Filtered vs Unfiltered Data\n", + "\n", + "Let's compare the original and filtered data to see the effect of the low-pass filter." 
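+    ,
+    "\n",
+    "\n",
+    "A quick scalar summary of what the filter removed (a sketch; assumes both datasets were produced on the same common time grid):\n",
+    "\n",
+    "```python\n",
+    "resid = combined_ds['temperature'] - filtered_ds['temperature']\n",
+    "print(float(np.sqrt((resid**2).mean())))  # RMS of the removed high-frequency signal\n",
+    "```"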
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if 'combined_ds' in locals() and filtered_ds is not None:\n", + " # Compare filtered vs unfiltered for a subset of data\n", + " # Select a 10-day window for detailed comparison\n", + " start_time = combined_ds.time.values[len(combined_ds.time)//4] # Start 1/4 through\n", + " end_time = start_time + np.timedelta64(10, 'D') # 10-day window\n", + "\n", + " # Select subset\n", + " subset_orig = combined_ds.sel(time=slice(start_time, end_time))\n", + " subset_filt = filtered_ds.sel(time=slice(start_time, end_time))\n", + "\n", + " # Plot comparison for temperature\n", + " if 'temperature' in subset_orig.data_vars:\n", + " fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)\n", + "\n", + " # Choose a representative level (first one with data)\n", + " level = 0\n", + " depth = subset_orig.nominal_depth.values[level]\n", + " serial = subset_orig.serial_number.values[level]\n", + "\n", + " # Original data\n", + " orig_temp = subset_orig.temperature.isel(N_LEVELS=level)\n", + " axes[0].plot(orig_temp.time, orig_temp, 'b-', alpha=0.7, linewidth=1,\n", + " label='Original')\n", + " axes[0].set_ylabel('Temperature (°C)')\n", + " axes[0].set_title(f'Original Data - Serial {serial} at {depth}m')\n", + " axes[0].grid(True, alpha=0.3)\n", + " axes[0].legend()\n", + "\n", + " # Filtered data\n", + " filt_temp = subset_filt.temperature.isel(N_LEVELS=level)\n", + " axes[1].plot(filt_temp.time, filt_temp, 'r-', alpha=0.7, linewidth=1.5,\n", + " label='2-day Low-pass Filtered')\n", + " axes[1].set_ylabel('Temperature (°C)')\n", + " axes[1].set_xlabel('Time')\n", + " axes[1].set_title(f'Filtered Data - Serial {serial} at {depth}m')\n", + " axes[1].grid(True, alpha=0.3)\n", + " axes[1].legend()\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + " # Overlay comparison\n", + " fig, ax = plt.subplots(1, 1, figsize=(14, 6))\n", + " ax.plot(orig_temp.time, orig_temp, 'b-', alpha=0.5, linewidth=0.8,\n", + " label='Original')\n", + " ax.plot(filt_temp.time, filt_temp, 'r-', alpha=0.8, linewidth=2,\n", + " label='2-day Low-pass Filtered')\n", + " ax.set_ylabel('Temperature (°C)')\n", + " ax.set_xlabel('Time')\n", + " ax.set_title(f'Filtering Comparison - Serial {serial} at {depth}m')\n", + " ax.legend()\n", + " ax.grid(True, alpha=0.3)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "else:\n", + " print(\"Cannot compare - one or both datasets not available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Spectral Analysis: Effect of Filtering\n", + "\n", + "Let's examine the spectral characteristics to see how the filter affects different frequency components." 
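+    ,
+    "\n",
+    "\n",
+    "For reference, a single pass of an order-$n$ Butterworth low-pass filter has squared magnitude response $|H(f)|^2 = 1/(1 + (f/f_c)^{2n})$. With $n = 6$ and $f_c = (2\\,\\mathrm{days})^{-1}$, power at the semidiurnal period (0.5 days, so $f/f_c = 4$) is suppressed by a factor of roughly $6 \\times 10^{-8}$; `sosfiltfilt` runs the filter forward and backward, squaring this attenuation again.\n"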
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if 'combined_ds' in locals() and filtered_ds is not None:\n",
+    "    from scipy import signal\n",
+    "\n",
+    "    # Select a level with good data coverage\n",
+    "    level = 0\n",
+    "\n",
+    "    # Get temperature data\n",
+    "    if 'temperature' in combined_ds.data_vars:\n",
+    "        orig_temp = combined_ds.temperature.isel(N_LEVELS=level).dropna('time')\n",
+    "        filt_temp = filtered_ds.temperature.isel(N_LEVELS=level).dropna('time')\n",
+    "\n",
+    "        if len(orig_temp) > 100:  # Ensure sufficient data\n",
+    "            # Calculate each series' sampling rate separately (the filtered\n",
+    "            # file may be on a different time grid after interpolation)\n",
+    "            dt_hours = float(np.median(np.diff(orig_temp.time.values)) / np.timedelta64(1, 'h'))\n",
+    "            fs = 1.0 / dt_hours  # samples per hour\n",
+    "            dt_hours_filt = float(np.median(np.diff(filt_temp.time.values)) / np.timedelta64(1, 'h'))\n",
+    "            fs_filt = 1.0 / dt_hours_filt\n",
+    "\n",
+    "            # Compute power spectral density\n",
+    "            f_orig, psd_orig = signal.welch(orig_temp.values, fs=fs, nperseg=min(256, len(orig_temp)//4))\n",
+    "            f_filt, psd_filt = signal.welch(filt_temp.values, fs=fs_filt, nperseg=min(256, len(filt_temp)//4))\n",
+    "\n",
+    "            # Convert frequency to period in days\n",
+    "            period_orig = 1.0 / (f_orig * 24)  # days\n",
+    "            period_filt = 1.0 / (f_filt * 24)  # days\n",
+    "\n",
+    "            # Plot power spectral density\n",
+    "            fig, ax = plt.subplots(1, 1, figsize=(12, 6))\n",
+    "\n",
+    "            ax.loglog(period_orig[1:], psd_orig[1:], 'b-', alpha=0.7,\n",
+    "                      label='Original', linewidth=1.5)\n",
+    "            ax.loglog(period_filt[1:], psd_filt[1:], 'r-', alpha=0.8,\n",
+    "                      label='2-day Low-pass Filtered', linewidth=2)\n",
+    "\n",
+    "            # Mark important periods\n",
+    "            ax.axvline(2.0, color='gray', linestyle='--', alpha=0.7,\n",
+    "                       label='2-day cutoff')\n",
+    "            ax.axvline(1.0, color='gray', linestyle=':', alpha=0.7,\n",
+    "                       label='1-day (diurnal)')\n",
+    "            ax.axvline(0.5, color='gray', linestyle=':', alpha=0.7,\n",
+    "                       label='12-hour (semidiurnal)')\n",
+    "\n",
+    "            ax.set_xlabel('Period (days)')\n",
+    "            ax.set_ylabel('Power Spectral Density (°C²/cph)')\n",
+    "            ax.set_title('Spectral Analysis: Effect of 2-day Low-pass Filter')\n",
+    "            ax.legend()\n",
+    "            ax.grid(True, alpha=0.3)\n",
+    "            ax.set_xlim(0.1, 100)\n",
+    "\n",
+    "            plt.tight_layout()\n",
+    "            plt.show()\n",
+    "\n",
+    "            print(f\"Spectral analysis completed for level {level}\")\n",
+    "            print(f\"Sampling interval: {dt_hours:.2f} hours\")\n",
+    "            print(f\"Data length: {len(orig_temp)} points\")\n",
+    "        else:\n",
+    "            print(\"Insufficient data for spectral analysis\")\n",
+    "    else:\n",
+    "        print(\"Temperature data not available for spectral analysis\")\n",
+    "else:\n",
+    "    print(\"Cannot perform spectral analysis - datasets not available\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Multiple Mooring Processing Example\n",
+    "\n",
+    "The time gridding module also supports batch processing of multiple moorings."
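+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before a batch run, it can help to check which moorings actually have Stage 2 output. The sketch below assumes `basedir` from earlier and the per-instrument `*_use.nc` naming convention used in this notebook; adjust the glob pattern if your directory layout differs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pre-check (sketch): list moorings that already have Stage 2 '_use' files.\n",
+    "# Assumes basedir from earlier, with per-instrument files in instrument\n",
+    "# subdirectories under moor/proc/<mooring>/ (an assumed layout).\n",
+    "from pathlib import Path\n",
+    "\n",
+    "moor_base = Path(basedir) / 'moor' / 'proc'\n",
+    "ready = sorted(\n",
+    "    d.name for d in moor_base.iterdir()\n",
+    "    if d.is_dir() and any(d.glob('*/*_use.nc'))\n",
+    ")\n",
+    "print(f\"Moorings with Stage 2 output: {ready}\")"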
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base directory containing the mooring data\n",
+    "basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/'\n",
+    "\n",
+    "# Example of processing multiple moorings\n",
+    "# (This will only work if you have multiple moorings in your dataset)\n",
+    "\n",
+    "# List available moorings\n",
+    "moor_base = Path(basedir) / 'moor' / 'proc'\n",
+    "available_moorings = [d.name for d in moor_base.iterdir() if d.is_dir()]\n",
+    "\n",
+    "print(f\"Available moorings in {moor_base}:\")\n",
+    "for mooring in available_moorings[:5]:  # Show first 5\n",
+    "    print(f\"  - {mooring}\")\n",
+    "\n",
+    "if len(available_moorings) > 5:\n",
+    "    print(f\"  ... and {len(available_moorings)-5} more\")\n",
+    "\n",
+    "# Example batch processing (limited to a single mooring here; extend the\n",
+    "# list to process more moorings in one call)\n",
+    "moorings_to_process = ['dsE_1_2018']  # Add your mooring names\n",
+    "\n",
+    "results = process_multiple_moorings_time_gridding(\n",
+    "    moorings_to_process,\n",
+    "    basedir,\n",
+    ")\n",
+    "\n",
+    "print(\"Batch processing results:\")\n",
+    "for mooring, success in results.items():\n",
+    "    status = \"SUCCESS\" if success else \"FAILED\"\n",
+    "    print(f\"  {mooring}: {status}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv (3.11.7)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
 },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ff1f4d0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "import yaml\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import xarray as xr\n",
-    "from datetime import datetime\n",
-    "from ctd_tools.writers import NetCdfWriter\n",
-    "\n",
-    "from oceanarray import tools\n",
-    "\n",
-    "moorlist = ['ds2_X_2012','ds2_X_2017','ds2_X_2018',\n",
-    "            'ds8_1_2012','ds9_1_2012','ds10_1_2012', 'ds11_1_2012','ds12_1_2012',\n",
-    "            'ds13_1_2012','ds14_1_2012','ds15_1_2012','ds16_1_2012','ds17_1_2012',\n",
-    "            'ds19_1_2012','ds18_1_2012','ds28_1_2017',\n",
-    "            'dsA_1_2018','dsB_1_2018','dsC_1_2018', 'dsD_1_2018','dsE_1_2018','dsF_1_2018',\n",
-    "            'dsM1_1_2017','dsM2_1_2017','dsM3_1_2017','dsM4_1_2017','dsM5_1_2017']\n",
-    "moorlist = ['dsE_1_2018']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "24e4a3c8",
-   "metadata": {},
-   "source": [
-    "## Load data for one mooring into datasets, list of xarray datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "84055eec",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from oceanarray.time_gridding import Stage3Processor, process_multiple_moorings_stage3\n",
-    "\n",
-    "# Simple usage\n",
-    "moorlist = ['dsE_1_2018']\n",
-    "basedir = '/Users/eddifying/Dropbox/data/ifmro_mixsed/ds_data_eleanor/'\n",
-    "results = process_multiple_moorings_stage3(moorlist, basedir, file_suffix='_use')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c60445eb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Access individual results\n",
-    "print(results['dsE_1_2018'])  # True or False\n",
-    "\n",
-    "# Check if a specific mooring succeeded\n",
-    "if results['dsE_1_2018']:\n",
-    "    print(\"dsE_1_2018 processed successfully\")\n",
-    "else:\n",
-    "    print(\"dsE_1_2018 
failed to process\")\n", - "\n", - "# Iterate through all results\n", - "for mooring_name, success in results.items():\n", - " status = \"SUCCESS\" if success else \"FAILED\"\n", - " print(f\"{mooring_name}: {status}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c23478db", - "metadata": {}, - "outputs": [], - "source": [ - "# For a successful mooring, the combined data is saved as:\n", - "# {proc_dir}/{mooring_name}/{mooring_name}_mooring_use.nc\n", - "\n", - "# Load the combined dataset\n", - "import xarray as xr\n", - "\n", - "mooring_name = 'dsE_1_2018'\n", - "if results[mooring_name]: # Only load if processing succeeded\n", - " combined_file = f\"{basedir}/moor/proc/{mooring_name}/{mooring_name}_mooring_use.nc\"\n", - " combined_ds = xr.open_dataset(combined_file)\n", - "\n", - " print(f\"Combined dataset shape: {dict(combined_ds.dims)}\")\n", - " print(f\"Variables: {list(combined_ds.data_vars)}\")\n", - " print(f\"Instruments: {combined_ds.nominal_depth.values}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5b2b20b", - "metadata": {}, - "outputs": [], - "source": [ - "# The combined dataset has structure like:\n", - "# Dimensions: (time: 8640, N_LEVELS: 3)\n", - "# Variables: temperature(time, N_LEVELS), salinity(time, N_LEVELS), etc.\n", - "\n", - "# Access temperature data for all instruments\n", - "temp_data = combined_ds.temperature # Shape: (time, N_LEVELS)\n", - "\n", - "# Get temperature for a specific instrument level\n", - "temp_level_0 = combined_ds.temperature[:, 0] # First instrument\n", - "temp_level_1 = combined_ds.temperature[:, 1] # Second instrument\n", - "\n", - "# Access metadata for each level\n", - "depths = combined_ds.nominal_depth.values\n", - "serials = combined_ds.serial_number.values\n", - "clock_offsets = combined_ds.clock_offset.values\n", - "\n", - "print(f\"Instrument depths: {depths}\")\n", - "print(f\"Serial numbers: {serials}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db454428", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "def plot_mooring_temperature(combined_ds, mooring_name):\n", - " \"\"\"Plot temperature records from all instruments on a mooring.\"\"\"\n", - "\n", - " # Extract metadata for legend labels\n", - " instruments = combined_ds.instrument_id.values if 'instrument_id' in combined_ds else None\n", - " serials = combined_ds.serial_number.values\n", - " depths = combined_ds.nominal_depth.values\n", - "\n", - " # Get instrument names from global attributes if available\n", - " if 'instrument_names' in combined_ds.attrs:\n", - " instrument_names = combined_ds.attrs['instrument_names'].split(', ')\n", - " else:\n", - " instrument_names = ['unknown'] * len(serials)\n", - "\n", - " # Create the plot\n", - " fig, ax = plt.subplots(figsize=(12, 6))\n", - "\n", - " # Plot each instrument's temperature record\n", - " for i in range(combined_ds.dims['N_LEVELS']):\n", - " temp_data = combined_ds.temperature[:, i]\n", - "\n", - " # Skip if all NaN\n", - " if np.all(np.isnan(temp_data)):\n", - " continue\n", - "\n", - " # Create legend label\n", - " if i < len(instrument_names):\n", - " instrument_type = instrument_names[i]\n", - " else:\n", - " instrument_type = 'unknown'\n", - "\n", - " label = f\"{instrument_type}:{int(serials[i])} ({depths[i]:.0f}m)\"\n", - "\n", - " # Plot the data\n", - " ax.plot(combined_ds.time, temp_data, label=label, linewidth=0.8)\n", - "\n", - " # 
Formatting\n", - " ax.set_xlabel('Time')\n", - " ax.set_ylabel('Temperature (°C)')\n", - " ax.set_title(f'{mooring_name} - Temperature Records')\n", - " ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " ax.grid(True, alpha=0.3)\n", - "\n", - " # Rotate x-axis labels for better readability\n", - " plt.xticks(rotation=45)\n", - " plt.tight_layout()\n", - "\n", - " return fig, ax\n", - "\n", - "# Usage example:\n", - "mooring_name = 'dsE_1_2018'\n", - "if results[mooring_name]:\n", - " combined_file = f\"{basedir}/moor/proc/{mooring_name}/{mooring_name}_mooring_use.nc\"\n", - " combined_ds = xr.open_dataset(combined_file)\n", - "\n", - " fig, ax = plot_mooring_temperature(combined_ds, mooring_name)\n", - " plt.show()\n", - "\n", - " # Optional: save the plot\n", - " # plt.savefig(f'{mooring_name}_temperature_timeseries.png', dpi=150, bbox_inches='tight')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv (3.11.7)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 4 }