diff --git a/.github/workflows/ci_conda_constructor.yml b/.github/workflows/ci_conda_constructor.yml index f1af78e..a88cf6e 100644 --- a/.github/workflows/ci_conda_constructor.yml +++ b/.github/workflows/ci_conda_constructor.yml @@ -19,7 +19,7 @@ jobs: ARCH: x86_64 OS_NAME: "Linux" # x86_64 MACOS - - os: macos-13 + - os: macos-15-intel ARCH: x86_64 OS_NAME: "MacOSX" # ARM MACOS diff --git a/.github/workflows/ci_conda_constructor_prerelease.yml b/.github/workflows/ci_conda_constructor_prerelease.yml index b90e858..908d7a3 100644 --- a/.github/workflows/ci_conda_constructor_prerelease.yml +++ b/.github/workflows/ci_conda_constructor_prerelease.yml @@ -3,7 +3,7 @@ on: workflow_dispatch: inputs: branch: - description: 'Branch of Release' + description: 'Git ref to install into the prerelease environment (branch, tag, or commit SHA)' default: 'develop' required: true @@ -25,7 +25,7 @@ jobs: ARCH: x86_64 OS_NAME: "Linux" # x86_64 MACOS - - os: macos-13 + - os: macos-15-intel ARCH: x86_64 OS_NAME: "MacOSX" # ARM MACOS @@ -52,6 +52,29 @@ jobs: run: | cd scripts bash build_dev.sh ${{ inputs.branch }} + + - name: Test installer + shell: bash -l {0} + run: | + installer="$(find scripts -maxdepth 1 -type f -name 'ecodata*.sh' | head -n 1)" + if [ -z "$installer" ]; then + echo "No shell installer found in scripts/" + find scripts -maxdepth 1 -type f -name 'ecodata*' -print + exit 1 + fi + + prefix="$RUNNER_TEMP/ecodata-prerelease-install" + rm -rf "$prefix" + + bash "$installer" -b -f -p "$prefix" + + smoke_dir="$RUNNER_TEMP/ecodata-prerelease-smoke" + rm -rf "$smoke_dir" + mkdir -p "$smoke_dir" + cd "$smoke_dir" + + "$prefix/bin/python" -c "import ecodata; from ecodata.app.apps import applications; print('Installed ecodata:', getattr(ecodata, '__version__', 'unknown'), ecodata.__file__); print('Registered apps:', ', '.join(sorted(applications))); assert applications" + - name: upload artifacts uses: actions/upload-artifact@v4 with: @@ -88,8 +111,44 @@ jobs: run: | cd scripts 
.\build_dev.ps1 ${{ inputs.branch }} + + - name: Test installer + shell: pwsh + run: | + $installer = Get-ChildItem -Path scripts -Filter "ecodata*.exe" | Select-Object -First 1 + if (-not $installer) { + Write-Host "No Windows installer found in scripts/" + Get-ChildItem -Path scripts -Filter "ecodata*" + exit 1 + } + + $prefix = Join-Path $env:RUNNER_TEMP "ecodata-prerelease-install" + if (Test-Path $prefix) { + Remove-Item -Recurse -Force $prefix + } + + $process = Start-Process ` + -FilePath $installer.FullName ` + -ArgumentList "/S", "/InstallationType=JustMe", "/NoShortcuts=1", "/D=$prefix" ` + -NoNewWindow ` + -Wait ` + -PassThru + + if ($process.ExitCode -ne 0) { + throw "Installer failed with exit code $($process.ExitCode)" + } + + $smokeDir = Join-Path $env:RUNNER_TEMP "ecodata-prerelease-smoke" + if (Test-Path $smokeDir) { + Remove-Item -Recurse -Force $smokeDir + } + New-Item -ItemType Directory -Path $smokeDir | Out-Null + Set-Location $smokeDir + + & "$prefix\python.exe" -c "import ecodata; from ecodata.app.apps import applications; print('Installed ecodata:', getattr(ecodata, '__version__', 'unknown'), ecodata.__file__); print('Registered apps:', ', '.join(sorted(applications))); assert applications" + - name: upload artifacts uses: actions/upload-artifact@v4 with: path: scripts/ecodata* - name: ecodata-dev-${{ matrix.OS_NAME }}-${{ matrix.ARCH }} \ No newline at end of file + name: ecodata-dev-${{ matrix.OS_NAME }}-${{ matrix.ARCH }} diff --git a/.github/workflows/ci_deploy_win.yml b/.github/workflows/ci_deploy_win.yml index 31d4eae..7f37ffb 100644 --- a/.github/workflows/ci_deploy_win.yml +++ b/.github/workflows/ci_deploy_win.yml @@ -26,7 +26,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Setup Miniconda - - uses: conda-incubator/setup-miniconda@v3 + uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true activate-environment: eco-dev @@ -38,5 +38,5 @@ jobs: cd ~ pwd ls - pip install 
git+https://github.com/jemissik/ecodata.git@${{ github.sha }} - python -c 'import ecodata; print(ecodata.available()); import ecodata.app.apps.tracks_explorer_app' + conda run -n eco-dev python -m pip install --no-deps git+https://github.com/jemissik/ecodata.git@${{ github.sha }} + conda run -n eco-dev python -c 'import ecodata; print(ecodata.available()); import ecodata.app.apps.tracks_explorer_app' diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index ae2da39..4020647 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -37,6 +37,10 @@ jobs: - name: Update base environment run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: bash -el {0} + run: python -m pip install --no-deps -e . + - name: Run tests shell: bash -el {0} run: | diff --git a/.github/workflows/ci_mac.yml b/.github/workflows/ci_mac.yml index 0c4fbf8..4fa27b8 100644 --- a/.github/workflows/ci_mac.yml +++ b/.github/workflows/ci_mac.yml @@ -36,6 +36,10 @@ jobs: - name: Update base environment run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: bash -el {0} + run: python -m pip install --no-deps -e . + - name: Run tests shell: bash -el {0} run: pytest -s diff --git a/.github/workflows/ci_win.yml b/.github/workflows/ci_win.yml index 97af7b3..c1981d4 100644 --- a/.github/workflows/ci_win.yml +++ b/.github/workflows/ci_win.yml @@ -27,15 +27,20 @@ jobs: steps: - uses: actions/checkout@v3 - name: Setup Miniconda - - uses: conda-incubator/setup-miniconda@v3 + uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true activate-environment: eco-dev environment-file: ecodata-env.yml - name: Update base environment + shell: pwsh run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: pwsh + run: conda run -n eco-dev python -m pip install --no-deps -e . 
+ - name: Run tests - shell: sh -l {0} - run: pytest -s + shell: pwsh + run: conda run -n eco-dev pytest -s diff --git a/.readthedocs.yaml b/.readthedocs.yaml index af072f8..eea0ca0 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,10 +10,11 @@ build: jobs: post_install: - conda env update --name ${READTHEDOCS_VERSION} --file ecodata-dev-env.yml + - python -m pip install --no-deps -e . - conda list conda: environment: ecodata-env.yml # Build all downloadable formats -formats: all \ No newline at end of file +formats: all diff --git a/docs/apps/developer_guide.md b/docs/apps/developer_guide.md index 6a58eab..bd90ef6 100644 --- a/docs/apps/developer_guide.md +++ b/docs/apps/developer_guide.md @@ -31,6 +31,7 @@ If you have an existing conda installation, it's strongly recommended to make su ``` conda env update --name eco-dev --file ecodata-dev-env.yml + python -m pip install --no-deps -e . ``` ### Launching the apps diff --git a/docs/apps/images/annotation_engine_crop_controls.png b/docs/apps/images/annotation_engine_crop_controls.png new file mode 100644 index 0000000..e32a04f Binary files /dev/null and b/docs/apps/images/annotation_engine_crop_controls.png differ diff --git a/docs/apps/images/annotation_engine_crop_top.png b/docs/apps/images/annotation_engine_crop_top.png new file mode 100644 index 0000000..e62ea88 Binary files /dev/null and b/docs/apps/images/annotation_engine_crop_top.png differ diff --git a/docs/apps/images/annotation_engine_merge_csv.png b/docs/apps/images/annotation_engine_merge_csv.png new file mode 100644 index 0000000..f4ffdd9 Binary files /dev/null and b/docs/apps/images/annotation_engine_merge_csv.png differ diff --git a/docs/apps/images/annotation_engine_nc_controls.png b/docs/apps/images/annotation_engine_nc_controls.png new file mode 100644 index 0000000..bc3748a Binary files /dev/null and b/docs/apps/images/annotation_engine_nc_controls.png differ diff --git a/docs/apps/images/annotation_engine_nc_loaded.png 
b/docs/apps/images/annotation_engine_nc_loaded.png new file mode 100644 index 0000000..08e1eb3 Binary files /dev/null and b/docs/apps/images/annotation_engine_nc_loaded.png differ diff --git a/docs/apps/images/annotation_engine_nc_overview.png b/docs/apps/images/annotation_engine_nc_overview.png new file mode 100644 index 0000000..d9f2e33 Binary files /dev/null and b/docs/apps/images/annotation_engine_nc_overview.png differ diff --git a/docs/apps/images/annotation_engine_tif_correction.png b/docs/apps/images/annotation_engine_tif_correction.png new file mode 100644 index 0000000..2125d61 Binary files /dev/null and b/docs/apps/images/annotation_engine_tif_correction.png differ diff --git a/docs/apps/images/annotation_engine_tif_loaded.png b/docs/apps/images/annotation_engine_tif_loaded.png new file mode 100644 index 0000000..33c3a1c Binary files /dev/null and b/docs/apps/images/annotation_engine_tif_loaded.png differ diff --git a/docs/apps/images/annotation_engine_tif_overview.png b/docs/apps/images/annotation_engine_tif_overview.png new file mode 100644 index 0000000..9ab1c35 Binary files /dev/null and b/docs/apps/images/annotation_engine_tif_overview.png differ diff --git a/docs/apps/images/nc_builder_input_files.png b/docs/apps/images/nc_builder_input_files.png new file mode 100644 index 0000000..f1ede54 Binary files /dev/null and b/docs/apps/images/nc_builder_input_files.png differ diff --git a/docs/apps/images/nc_builder_level_detection.png b/docs/apps/images/nc_builder_level_detection.png new file mode 100644 index 0000000..2a902f5 Binary files /dev/null and b/docs/apps/images/nc_builder_level_detection.png differ diff --git a/docs/apps/images/nc_builder_output_settings.png b/docs/apps/images/nc_builder_output_settings.png new file mode 100644 index 0000000..4badf4f Binary files /dev/null and b/docs/apps/images/nc_builder_output_settings.png differ diff --git a/docs/apps/images/nc_builder_overview.png b/docs/apps/images/nc_builder_overview.png new file mode 
100644 index 0000000..6369a16 Binary files /dev/null and b/docs/apps/images/nc_builder_overview.png differ diff --git a/docs/apps/images/nc_builder_spatial_subset.png b/docs/apps/images/nc_builder_spatial_subset.png new file mode 100644 index 0000000..798cbce Binary files /dev/null and b/docs/apps/images/nc_builder_spatial_subset.png differ diff --git a/docs/apps/images/nc_builder_status_panels.png b/docs/apps/images/nc_builder_status_panels.png new file mode 100644 index 0000000..484e5e3 Binary files /dev/null and b/docs/apps/images/nc_builder_status_panels.png differ diff --git a/docs/apps/images/nc_builder_time_detection.png b/docs/apps/images/nc_builder_time_detection.png new file mode 100644 index 0000000..ca9d123 Binary files /dev/null and b/docs/apps/images/nc_builder_time_detection.png differ diff --git a/docs/apps/images/nc_builder_time_subset.png b/docs/apps/images/nc_builder_time_subset.png new file mode 100644 index 0000000..0ee512f Binary files /dev/null and b/docs/apps/images/nc_builder_time_subset.png differ diff --git a/docs/apps/images/nc_builder_variables_coordinates.png b/docs/apps/images/nc_builder_variables_coordinates.png new file mode 100644 index 0000000..d1bfb3d Binary files /dev/null and b/docs/apps/images/nc_builder_variables_coordinates.png differ diff --git a/docs/apps/images/presence_actions.png b/docs/apps/images/presence_actions.png new file mode 100644 index 0000000..e686e5f Binary files /dev/null and b/docs/apps/images/presence_actions.png differ diff --git a/docs/apps/images/presence_apps_panel.png b/docs/apps/images/presence_apps_panel.png new file mode 100644 index 0000000..4e47943 Binary files /dev/null and b/docs/apps/images/presence_apps_panel.png differ diff --git a/docs/apps/images/presence_bounding_box.png b/docs/apps/images/presence_bounding_box.png new file mode 100644 index 0000000..dcb4706 Binary files /dev/null and b/docs/apps/images/presence_bounding_box.png differ diff --git 
a/docs/apps/images/presence_derived_metric_filters.png b/docs/apps/images/presence_derived_metric_filters.png new file mode 100644 index 0000000..2184076 Binary files /dev/null and b/docs/apps/images/presence_derived_metric_filters.png differ diff --git a/docs/apps/images/presence_input_files.png b/docs/apps/images/presence_input_files.png new file mode 100644 index 0000000..521503a Binary files /dev/null and b/docs/apps/images/presence_input_files.png differ diff --git a/docs/apps/images/presence_output_settings.png b/docs/apps/images/presence_output_settings.png new file mode 100644 index 0000000..103dd7e Binary files /dev/null and b/docs/apps/images/presence_output_settings.png differ diff --git a/docs/apps/images/presence_region_polygon.png b/docs/apps/images/presence_region_polygon.png new file mode 100644 index 0000000..6722372 Binary files /dev/null and b/docs/apps/images/presence_region_polygon.png differ diff --git a/docs/apps/images/presence_time_spatial_aggregation.png b/docs/apps/images/presence_time_spatial_aggregation.png new file mode 100644 index 0000000..80bf3f2 Binary files /dev/null and b/docs/apps/images/presence_time_spatial_aggregation.png differ diff --git a/docs/apps/images/presence_vetting_filters.png b/docs/apps/images/presence_vetting_filters.png new file mode 100644 index 0000000..e36131f Binary files /dev/null and b/docs/apps/images/presence_vetting_filters.png differ diff --git a/docs/apps/user_guide/annotation_engine.md b/docs/apps/user_guide/annotation_engine.md new file mode 100644 index 0000000..1d1ff27 --- /dev/null +++ b/docs/apps/user_guide/annotation_engine.md @@ -0,0 +1,361 @@ +# Annotation Engine + +## Overview + +### App features + +With the Annotation Engine App, you can + +- Annotate movement data with environmental variables from gridded environmental datasets. +- Load movement data and environmental data from supported formats such as NetCDF and GeoTIFF. +- Select environmental variables for annotation. 
+- Match movement records with environmental values by location and time. +- Use different annotation approaches for continuous variables and categorical or quality-control variables. +- Apply spatial and temporal matching or interpolation methods where supported. +- Optionally apply scale factor and offset corrections to continuous variables. +- Export annotated movement data for further analysis or visualization. + +### Using the app + +1. If you haven't already, prepare a local movement data file and the environmental datasets you want to use for annotation. +2. Launch the Annotation Engine App. +3. Select the movement data file. The file should contain location and time information compatible with the ECODATA movement data format. +4. Load the environmental dataset or datasets. Depending on the workflow, these may be NetCDF or GeoTIFF files. +5. Select the environmental variables that should be added to the movement records. +6. Specify whether selected variables should be treated as continuous variables or categorical / quality-control variables. +7. Select the annotation method and, if available, the spatial or temporal interpolation options. +8. If using continuous variables with scale factor or offset values, set these options before running the annotation. +9. Run the annotation process. +10. Review the status messages and save the annotated movement data file. + +## Annotation Engine App Manual + +The Annotation Engine App contains four working tabs: Annotation engine - .nc, Annotation engine - .tif, Crop & interpolate csv, and Merge csv. The first two tabs annotate Movebank-style movement data with environmental variables. Crop & interpolate csv prepares movement CSV files before annotation. Merge csv merges CSV outputs into one file. 
+ +## Tab #1: Annotation engine - .nc + +![View of the user interface: Annotation engine - .nc tab.](../images/annotation_engine_nc_overview.png) + +### Purpose + +The Annotation engine - .nc tab annotates animal movement tracks from a Movebank-style CSV file using environmental variables from one NetCDF (.nc) file. Each movement point is matched by timestamp, latitude, and longitude to selected environmental variables. + +The current version supports separate handling of continuous variables and categorical/QC variables. Continuous variables can be interpolated, while categorical flags, quality-control layers, land-cover classes, and similar data remain as raw codes. + +- Spatial filtering by an optional vector boundary (.shp or .geojson). +- Automatic boundary from the NetCDF extent if no vector boundary is selected. +- Manual selection of the Time, Latitude, and Longitude coordinate variables from the NetCDF file. +- Support for 3D variables with dimensions such as time/lat/lon and for common vertical level dimensions. If a variable contains pressure/vertical levels, the app exposes labels such as `variable_1000`, `variable_975`, etc. +- Saving one main annotated CSV and separate per-individual CSV files. + +### Overview + +#### Step 1. Load environmental data (.nc) + +- Select one NetCDF file in the Environmental data (.nc) selector. +- Press Load environmental data. +- The app reads the file name, time range, spatial range, available variables, and candidate coordinate variables. +- Check the Time variable, Latitude variable, and Longitude variable selectors. Adjust them manually if the automatic choice is not correct. +- Choose environmental variables in the Continuous selector and/or in the Categorical/QC selector. The same variable cannot be used as both continuous and categorical/QC. + +![Annotation engine - .nc tab after environmental data, movement data, and boundary data are loaded.](../images/annotation_engine_nc_loaded.png) + +#### Step 2. 
Load movement data (.csv) + +- Select a Movebank-style CSV file in the Movebank data (.csv) selector. +- Press Load movement data. +- The app extracts taxon names, individual IDs, the movement time range, and the movement spatial range. +- Optionally filter by Select Taxon, then select one or more IDs in Select ID. + +#### Step 2a. Configure annotation options + +![Bottom/control part of the Annotation engine - .nc tab.](../images/annotation_engine_nc_controls.png) + +| Option | Meaning in the current app | +| --- | --- | +| Continuous variables | Numeric environmental variables that may be spatially interpolated and linearly interpolated in time. | +| Categorical/QC variables | Class, flag, mask, quality control, land cover or similar variables that are sampled by nearest spatial grid cell and nearest available timestep. | +| Time / Latitude / Longitude variable | NetCDF coordinate mapping used to standardize the dataset before annotation (some files might use names such as `valid_time`, `y`, `x`, `latitude`, or `longitude`). | +| Interpolation method (spatial) | Nearest neighbor or Inverse Distance Weighting (IDW). The selected method applies to continuous variables. Categorical/QC variables always use nearest-neighbor logic. | +| Number of nearest grid points | Used for IDW. Available values are 2, 4, 6, and 8. For Nearest neighbor, the value is forced to 1 internally. | +| Output path | Path to the main annotated CSV. The default is in the user Downloads folder. | + +#### Interpolation behavior + +| Variable type | Nearest neighbor selected | IDW selected | +| --- | --- | --- | +| Continuous | Nearest spatial grid cell + linear interpolation in time. | k nearest spatial grid cells + linear interpolation in time for each grid cell + IDW spatial weighting. | +| Categorical/QC | Nearest spatial grid cell + nearest available timestep. | Still uses nearest spatial grid cell + nearest available timestep. It is not IDW-averaged and not linearly interpolated in time. 
| + +#### Step 3. Load boundary data (optional) + +- Select a .shp or .geojson boundary in the Boundary data (.shp/.geojson) selector. +- Press Load boundary data. +- The app displays the boundary file name and spatial range. +- If no boundary is selected, the NetCDF spatial extent is used automatically during annotation. +- Use (!) Reset boundary to return to the automatic NetCDF boundary. + +#### Step 4. Run annotation + +- Press Make annotated file in the 4. Start annotation block. +- The app filters movement records by selected ID(s). +- The app applies the selected boundary. If no boundary is loaded, it uses the NetCDF extent. +- The app prefilters movement points to the union time window of the selected NetCDF variables. +- The app annotates each point with the selected continuous and categorical/QC variables using the rules described above. +- The app saves the main CSV to Output path and creates an `annotated_individuals` folder next to it. + +### Output scenarios + +- Boundary not loaded: automatic boundary from the NetCDF extent is used. +- Boundary loaded: only movement points inside the polygon are kept. +- Continuous + Nearest neighbour. +- Continuous + IDW. +- Categorical/QC variables: nearest spatial cell and nearest timestep, regardless of the selected interpolation method for continuous variables. +- Movement points outside the NetCDF spatial or temporal coverage are removed by filtering. + +### Expected results + +- Main annotated CSV saved to the selected Output path. +- A subfolder named `annotated_individuals` containing one annotated CSV per individual ID, if the individual ID column is available. + +## Tab #2: Annotation engine - .tif + +![View of the user interface: Annotation engine - .tif tab.](../images/annotation_engine_tif_overview.png) + +### Purpose + +The Annotation engine - .tif tab annotates movement tracks using environmental data distributed as AppEEARS-style GeoTIFF files. 
The user selects one sample .tif file from the target folder, and the app processes all .tif files in that same folder. + +The app converts the TIF stack into one temporary NetCDF file before annotation. In the current version, this temporary NetCDF is saved in the same folder as the input TIF files. Movebank data is not required for the TIF-to-NetCDF loading/conversion step, but it is required when the annotation is actually run. + +- All .tif files in the selected folder are collected and converted to one NetCDF file. +- Only variables with at least three dimensions, typically time/lat/lon, are made available for annotation. +- Variables are split into Continuous variables and Categorical/QC variables. +- The app makes an initial guess of variable type from variable names, but the user should check and adjust this split manually. +- Optional post-sampling scale factor and add offset can be applied to continuous variables only. + +### Overview + +#### Step 1. Load TIF environmental data + +- Select any single .tif file inside the folder that contains the full TIF time series. +- Press Load TIF environmental data. +- The app collects all .tif files in the same folder. +- The app converts the TIF stack to one NetCDF file in the TIF folder. +- The app displays the generated NetCDF file name, time range, spatial range, and detected variables. +- Review the Continuous variables and Categorical/QC variables lists and correct them if necessary. + +![Annotation engine - .tif tab after data selection.](../images/annotation_engine_tif_loaded.png) + +Note: The TIF-to-NetCDF conversion keeps raw raster values. Scale factor and offset are not applied during conversion. Optional correction is applied later, after sampling, and only to continuous variables. + +#### Step 2. Load movement data (.csv) + +- Select a movement data CSV file with Movebank-compatible column names. +- Press Load movement data. 
+- The app extracts taxon names, individual IDs, time range, and spatial range. +- Select the required taxon(s) and individual ID(s). At least one individual ID must be selected before running TIF annotation. + +#### Step 2a. Configure interpolation and value correction + +![Interpolation and value correction control in Annotation engine - .tif tab.](../images/annotation_engine_tif_correction.png) + +| Option | Meaning in the current app | +| --- | --- | +| Continuous variables | Variables that can be spatially interpolated and linearly interpolated in time. Optional scale/offset correction can be applied to these variables after sampling. | +| Categorical/QC variables | Variables that should remain raw category, flag, or QC codes. They are sampled by nearest grid cell and nearest timestep only. | +| Interpolation method (spatial) | Nearest neighbour or IDW for continuous variables. Categorical/QC variables are not IDW-averaged. | +| Number of nearest grid points | Controls the number of grid cells used for IDW for continuous variables. | +| Apply scale factor / offset | Enables post-sampling correction for continuous variables only. | +| Scale factor | Multiplier in `corrected_value = sampled_value * scale_factor`. | +| Add offset | Additive offset in `corrected_value = sampled_value * scale_factor + add_offset`. | +| Output path | Path to the main annotated CSV. The default is in the user Downloads folder. | + +#### Step 3. Load boundary data (optional) + +- Select a .shp or .geojson boundary file if spatial filtering by polygon is required. +- Press Load boundary data. +- If no boundary is selected, the extent of the generated NetCDF is used automatically. +- Use (!) Reset boundary to return to the automatic NetCDF boundary. + +#### Step 4. Run annotation + +- Press Make annotated file in the 4. Start annotation block. +- The app verifies that Movebank data, a sample TIF file, selected ID(s), and selected environmental variables are available. 
+- The app reconverts or confirms the TIF stack as a NetCDF file in the TIF folder. +- The app builds the environmental variable map from valid time/lat/lon variables. +- The app applies the selected boundary or the automatic NetCDF extent. +- Continuous variables are annotated using the selected spatial interpolation method and linear temporal interpolation. +- Categorical/QC variables are annotated using nearest spatial grid cell and nearest available timestep. +- If enabled, scale factor and add offset are applied after sampling to continuous variables only. +- The main annotated CSV and per-individual CSV files are saved. + +### Output scenarios + +- Boundary not loaded: automatic boundary from the generated NetCDF extent is used. +- Boundary loaded: only movement points inside the polygon are used. +- Continuous + Nearest neighbour. +- Continuous + IDW. +- Categorical/QC variables: never IDW-averaged, never time-linearly interpolated, and never scaled/offset. +- Scale/offset disabled: raw sampled values are saved. +- Scale/offset enabled: corrected continuous values are saved using `corrected_value = sampled_value * scale_factor + add_offset`. + +### Expected results + +- Main annotated CSV saved to Output path. +- A subfolder named `annotated_individuals` containing one annotated CSV per individual ID. +- A generated temporary NetCDF file stored in the same folder as the source TIF files. + +## Tab #3: Crop & Interpolate CSV + +![View of the user interface: Crop & Interpolate CSV tab.](../images/annotation_engine_crop_top.png) + +### Purpose + +The Crop & Interpolate CSV tab is designed to pre-process Movebank CSV datasets by: + +- Subsetting by taxon or individual IDs. +- Cropping to a specific time range. +- Interpolating missing values with a regular time grid. +- Optionally averaging numeric variables over fixed intervals. +- Splitting sessions based on deployment gaps. +- Producing per-individual and merged CSV outputs. + +### Overview + +#### Step 1. 
Load Input Data + +1. Local CSV file + + - Select a Movebank-style CSV file. + - Press Load data. + - Taxons and IDs are extracted. + - Time coverage is shown in the slider. + +2. Select Taxons and IDs + + - Use Taxon name to filter IDs. + - Choose one or more Individual IDs. + +![Bottom part of the Crop & Interpolate CSV tab.](../images/annotation_engine_crop_controls.png) + +#### Step 2. Configure Options + +Simple interpolation (missing <= 1 day) + +- Fills only short gaps (up to 1 day). +- Creates CSVs with interpolated values only for eligible gaps. + +Advanced interpolation ("Make CSV" button) + +1. Deployment time gap (minutes) + + - Splits sessions if there is a gap larger than this threshold. + - Prevents mixing different tracking deployments. + +2. Minimum expected observations + + - Skips sessions with too few points (e.g., <100). + +3. Time range + + - Set with the slider. + - Crops data to the chosen period. + +4. Interpolation/Averaging parameters + + - Timestep (minutes): + - If =1: interpolation only. + - If >1: averaging over intervals. + - First timestamp = 00:00:00: + - Optionally truncate sessions to start at midnight. + +5. Output CSV name + + - Base name for generated files. + - Example: `subset.csv` creates `subset_.csv`. + +6. Merge & cleanup options + + - Merge files after processing: combine all per-ID files. + - Delete individual files after merge: keep only the merged dataset. + +#### Step 3. Run Processing + +Press Make CSV. The app: + +1. Filters data by ID and time. +2. Splits into sessions using `deployment_time_gap`. +3. Performs 1-minute interpolation (always). +4. Performs optional averaging at user timestep. +5. Writes per-session CSVs (`___to_.csv`). +6. Creates per-ID CSVs. +7. Optionally merges files into a single dataset. + +### Output scenarios + +- Simple interpolation only: per-ID CSVs with gaps <= 1 day filled. +- Make CSV with timestep = 1: per-ID datasets interpolated to 1-min intervals. 
+- Make CSV with timestep > 1: per-ID datasets averaged (e.g., 30 min). +- Merge enabled: single combined CSV created. +- Delete after merge enabled: keeps only merged file. + +### Expected results + +You will obtain: + +1. Per-ID CSVs (subsetted and interpolated). +2. Session-specific files (`__start_to_end.csv`). +3. Optional merged dataset. +4. Optional interpolated files (`__interp_inplace_le1d.csv`) when using Simple interpolation. + +## Tab #4: Merge CSV + +![View of the user interface: Merge CSV tab.](../images/annotation_engine_merge_csv.png) + +### Purpose + +The Merge CSV tab is designed to combine multiple CSV files (from previous processing or annotation) into a single dataset. + +It ensures consistency of column names, optionally removes non-overlapping columns, and provides one merged file for further analysis. + +### Overview + +#### Step 1. Select Folder + +- Enter or browse to a folder containing multiple .csv files. +- All files in the folder will be processed. + +#### Step 2. Configure Options + +- Delete empty columns after merging + - If enabled: columns missing in at least one file are dropped. + - If disabled: all columns are preserved; missing values filled with NaN. +- Output merged CSV name + - Path and filename for the merged file (default: `merged.csv`). + +#### Step 3. Run Merge + +Press Merge files in folder. The app: + +- Collects all .csv files from the folder. +- Normalizes column names. +- Concatenates data into one DataFrame. +- Drops non-shared columns if the option is enabled. +- Saves the merged file to the defined path. + +### Output scenarios + +- Delete empty columns enabled + - Cleaner file with only shared columns. + - Some information may be lost. +- Delete empty columns disabled + - All columns preserved. + - Resulting CSV may contain NaN values in places where a column was missing in some files. + +### Expected results + +After successful processing, you will obtain: + +1. One merged CSV saved at the chosen path. +2. 
A message in the app showing the path of the merged file and the list of columns that were removed, if any. diff --git a/docs/apps/user_guide/index.md b/docs/apps/user_guide/index.md index a83be3f..c07f77f 100644 --- a/docs/apps/user_guide/index.md +++ b/docs/apps/user_guide/index.md @@ -14,5 +14,9 @@ tracks_explorer gridded_data_explorer subsetter movie_maker +annotation_engine +presence_data_preparation +nc_builder +multidimensional_annotation ``` \ No newline at end of file diff --git a/docs/apps/user_guide/multidimensional_annotation.md b/docs/apps/user_guide/multidimensional_annotation.md new file mode 100644 index 0000000..f85f4c1 --- /dev/null +++ b/docs/apps/user_guide/multidimensional_annotation.md @@ -0,0 +1,23 @@ +# Multidimensional Annotation + +## App features + +With the Multidimensional Annotation App, you can +- Annotate movement data with environmental variables from multidimensional NetCDF datasets. +- Work with environmental data that include time, latitude, longitude, and vertical dimensions such as pressure level, height, or model level. +- Load movement records and match them with selected environmental variables. +- Select coordinate variables and environmental variables from NetCDF files. +- Use multidimensional environmental datasets for advanced annotation workflows. +- Export movement data enriched with selected environmental values. + +## Using the app + +1. If you haven't already, prepare a local movement data file and the NetCDF environmental files you want to use for annotation. +2. Launch the Multidimensional Annotation App. +3. Select the movement data file. The file should contain location and time information compatible with the ECODATA movement data format. +4. Load the NetCDF environmental dataset or datasets. +5. Select the coordinate variables used by the environmental file, such as time, latitude, longitude, and vertical level. +6. Select the environmental variables that should be extracted for the movement records. +7. 
Check that the spatial, temporal, and vertical coverage of the environmental data matches the movement data. +8. Run the annotation process. +9. Review the status messages and save the annotated output file. \ No newline at end of file diff --git a/docs/apps/user_guide/nc_builder.md b/docs/apps/user_guide/nc_builder.md new file mode 100644 index 0000000..ce32bf1 --- /dev/null +++ b/docs/apps/user_guide/nc_builder.md @@ -0,0 +1,289 @@ +# NC Builder + +## Overview + +### App features + +With the NC Builder App, you can + +- Prepare and standardize NetCDF files for use in ECODATA annotation workflows. +- Load one or more NetCDF files from local folders. +- Inspect available variables, coordinates, dimensions, and time information. +- Select target variables and assign standard coordinate roles such as time, latitude, longitude, and vertical level. +- Combine files by time, by level, or by both time and level, depending on the structure of the source data. +- Optionally apply spatial and temporal subsetting. +- Export a standardized NetCDF file that can be used by ECODATA annotation apps. + +### Using the app + +1. If you haven't already, prepare the NetCDF files that need to be combined or standardized. +2. Launch the NC Builder App. +3. Select the input folder or input files containing the NetCDF data. +4. Choose the combine mode, such as combining by time, by vertical level, or by both time and level. +5. Inspect the detected variables and coordinates. +6. Select the target variable and assign the correct coordinate fields for time, latitude, longitude, and, if needed, vertical level. +7. If needed, set spatial or temporal subset options. +8. Specify the output file name and location. +9. Click the build button to create the standardized NetCDF file. +10. Review the status messages and check the output file before using it in annotation workflows. 
+ +## NetCDF Builder App User Manual + +This manual describes the current workflow and interface of the NC Builder App in Ecodata Prepare!. The app prepares standardized CF-style NetCDF files from multiple ERA5 or other NetCDF inputs before further processing, including multilevel annotation, which can be performed in the Multidimensional Annotation App. + +![View of the user interface of NetCDF Builder App.](../images/nc_builder_overview.png) + +## Purpose + +The NetCDF Builder App is designed to combine, standardize, and optionally subset multiple NetCDF files into a single output file with predictable coordinate names and basic CF-style metadata. + +The app is useful when environmental datasets are stored as separate files by time, by vertical level, or by both time and level. It can be used as a preparation step before annotation, 3D annotation, visualization, or other ECODATA workflows that expect harmonized NetCDF coordinates. + +The module supports: + +- Selecting a local folder and loading supported NetCDF files from that folder. +- Choosing which files from the folder should be included in scan, validation, and build steps. +- Combining files by time, by vertical level, or by both time and vertical level. +- Selecting one or more target variables for the output NetCDF file. +- Mapping source coordinate names to standard output names: time, lat, lon, and level. +- Detecting time and level information from NetCDF coordinates, filenames, or manual tables. +- Optionally applying a geographic bounding box subset. +- Optionally applying a time subset in modes where this is supported. +- Writing a single standardized NetCDF file and an accompanying manifest JSON file. + +## Overview + +### Step 1. Select Input Files + +![Input section: 1. Input files.](../images/nc_builder_input_files.png) + +Use the section "1. Input files" to define the source folder and select the files that should be processed. + +1. 
Input folder + + Select the folder that contains the NetCDF files using the file selector labelled "Input folder". The selector is used to define the folder. If a file inside the folder is selected, the app uses its parent folder. + +2. Load file list + + Press "Load file list" to scan the selected folder for supported NetCDF-like files. Supported extensions are `.nc`, `.nc4`, `.cdf`, and `.netcdf`. + +3. Select files from current folder + + After loading the file list, the widget "Select files from current folder" is populated with all supported files found in the folder. All files are selected by default. Deselect files that should not be included in the current build. + +4. Combine mode + + Choose how the selected files should be combined: + + - By time: files are combined along the time dimension. In this mode, the selected files define the time range, and the Start time / End time widgets are disabled. + - By level: files are combined along the vertical level dimension. + - By time and level: files are combined using both time and level coordinates when possible. + +5. Scan variables + + Press "Scan variables" after selecting the input files. The app scans selected files, detects available variables, coordinates, dimensions, and suggested coordinate names, and updates the Preview panel. + +### Step 2. Configure Variables and Coordinates & time + +![Variables, coordinates and time section.](../images/nc_builder_variables_coordinates.png) + +Use this section to define which variable or variables will be written to the output file and how source coordinate names should be standardized. + +Target variable(s) + +Select one or more variables from the scanned source files. In single-variable case, the selected variable can be renamed using "Output variable name". In multi-variable mode, the app preserves the original variable names. + +Coordinate mapping + +- Time variable. +- Latitude variable. +- Longitude variable. 
+- Vertical / level variable: source variable or coordinate that represents vertical level. Use "None" if a level coordinate is not required. + +Output coordinate names + +The standardized output coordinate names are `time`, `lat`, `lon`, and `level`. The app writes basic CF-style coordinate attributes. For latitude and longitude, it uses `degrees_north` and `degrees_east`. For pressure levels, it can write `air_pressure` metadata. + +Output variable name + +In single-variable mode, this field defines the name of the data variable in the output NetCDF file. If the field is empty after scanning, the app usually fills it with the selected source variable name. + +Output level coordinate name and Level units + +The field "Output level coordinate name" is shown as a user setting, with the default value `level`. The current version standardizes the actual level coordinate to `level`. Use "Level units" to define whether the vertical coordinate is `hPa`, `m`, `Pa`, `model_level`, or `custom`. If `custom` is selected, enter the units in "Custom level units". + +### Step 3. Configure Level Detection + +![Level detection section: 3. Level detection.](../images/nc_builder_level_detection.png) + +Use "Level source" to define how vertical level values should be obtained. + +- From NetCDF coordinate: use the selected "Vertical / level variable" from the input files. +- From filename: extract the level value from each filename using "Level regex". The default pattern is `level(\d+)`. +- Manual table: read level values from a CSV table whose path is entered in "Level table file". The CSV should contain columns named `name` and `level`. The `name` value should match the input file name or a unique part of it. + +### Step 4. Configure Time Detection + +![Time detection section: 4. Time detection.](../images/nc_builder_time_detection.png) + +Use "Time source" to define how time values should be obtained. 
+ +- From NetCDF time coordinate: use the selected "Time variable" from the source files. +- From filename: extract the date or date-time value from each filename using "Time regex" and parse it using "Time format". The default pattern is `(\d{8})` and the default format is `%Y%m%d`. +- Manual table: read time values from a CSV table specified in "Time table file". In this case the CSV should contain columns named `name` and `DateTime` (example: `2026-01-01 00:00:00`). + +### Step 5. Configure Spatial Subset + +![Spatial subset section: 5. Spatial subset.](../images/nc_builder_spatial_subset.png) + +Use "Bounding box" if the output file should be cropped spatially. If the checkbox is not enabled, the original spatial extent is preserved. + +When Bounding box is enabled, enter: + +- South: minimum latitude. +- North: maximum latitude. +- West: minimum longitude. +- East: maximum longitude. + +The app expects latitude and longitude coordinates that can be subset as one-dimensional lat/lon coordinates. If the requested bounding box does not overlap the file grid, the build step stops with an error. + +### Step 6. Configure Time Subset + +![Time subset section: 6. Time subset.](../images/nc_builder_time_subset.png) + +After scanning, the app displays "Detected time range" if time can be read from the scanned files. The fields "Start time" and "End time" define an optional time subset. + +In "By time" mode, time subsetting is disabled. In that case, select the required files in "Select files from current folder" instead. This avoids unsafe time slicing for files that use non-standard calendars such as Julian or 360_day calendars. + +In "By level" and "By time and level" modes, the app applies the time subset when a standard pandas-compatible time coordinate is available. For cftime calendars, the current implementation skips time slicing. + +### Step 7. Define Output Settings and Build + +![Output settings and action buttons: 7. 
Output settings.](../images/nc_builder_output_settings.png) + +Use "7. Output settings" to define how the output file is written. + +- Output folder: folder where the generated NetCDF and manifest JSON files will be saved. The default is the Windows user Downloads folder. +- Output filename: name of the output NetCDF file. The default initial value is `era5_standardized_temperature.nc`. After scanning, the app may change it to `standardized_<variable>.nc` or `standardized_multivariable.nc`. +- Output mode: currently fixed to Single NetCDF file. +- Use chunking when reading: enables chunked reading through xarray/dask where available. +- Chunking mode: auto or manual. Manual mode enables time chunk, level chunk, lat chunk, and lon chunk fields. +- Enable NetCDF compression: writes compressed data variables using zlib compression when supported by the selected NetCDF writer. + +Validation and build actions + +- Validate: checks whether the selected settings are sufficient for the build step. It reports missing variables, invalid bounding-box values, missing time or level definitions, and output filename warnings. +- Build standardized NetCDF: starts the build process, writes the output NetCDF file, writes the manifest JSON file, and updates the Preview and Log panels. + +### Step 8. Read Preview, Validation, and Log Panels + +![Status panels: Preview, Validation, and Log.](../images/nc_builder_status_panels.png) + +The lower part of the app contains three status panels. + +- Preview: shows loaded files, scan results, detected variables, coordinates, dimensions, selected target variables, selected coordinate mappings, build results, and output paths. +- Validation: shows whether the current configuration is ready for building or lists issues that must be fixed. +- Log: records user actions and backend messages, including scan status, validation status, build start, build completion, and build errors. 
+ +## Output Files + +The app creates one standardized NetCDF file and one manifest JSON file. The manifest file is saved next to the NetCDF file and uses the suffix `.manifest.json`. + +### Standardized NetCDF File + +The output NetCDF file contains: + +- Selected target data variable or variables. +- Standardized coordinate names: `time`, `lat`, `lon`, and optionally `level`. +- CF-style coordinate attributes. +- Global attributes such as `title`, `Conventions`, `history`, `source_files_count`, and `combine_mode`. +- Longitude values converted to the -180..180 convention when possible. +- Optional MATLAB/MODIS-compatible time encoding: days since 2000-01-01 with julian calendar, when time is present. + +### Manifest JSON File + +The manifest records: + +- `output_path` and `manifest_path` +- `processed_files` +- `engine_by_file` +- full build configuration +- validation warnings +- output dimensions +- output variables +- output coordinates +- time encoding information, when time is present + +## Output Scenarios + +Scenario 1. Combine mode = By time + +Use this when separate files represent different times for the same variable and grid. The selected files define the time range. Start time and End time are disabled in this mode. + +Scenario 2. Combine mode = By level + +Use this when separate files represent different vertical levels for the same time or time range. Level values can be read from a NetCDF coordinate, extracted from filenames, or supplied by a manual table. + +Scenario 3. Combine mode = By time and level + +Use this when the source collection needs to be organized across both time and vertical levels. The app first standardizes each selected file and then combines datasets by coordinates when possible. + +Scenario 4. Single target variable selected + +The output variable can be renamed using "Output variable name". This is useful when source variable names are short or inconsistent. + +Scenario 5. 
Multiple target variables selected + +The output file keeps the original source variable names. + +Scenario 6. Bounding box enabled + +The output file is cropped to the selected South, North, West, and East limits, which reduces file size. + +Scenario 7. Bounding box disabled + +The output file preserves the original spatial extent of the input files. + +## Expected Results + +After processing, you will obtain: + +1. A standardized NetCDF file in the selected output folder. +2. A manifest JSON file next to the NetCDF file. +3. A Preview panel summary with output path, manifest path, output dimensions, output variables, and output coordinates. +4. A Log panel entry confirming that the build is complete. + +## Processing Logic + +When "Build standardized NetCDF" is pressed, the backend performs the following operations: + +1. Validates that the selected files exist and that the required variables and coordinates are defined. +2. Loads manual time or level tables when Manual table mode is selected. +3. Opens each selected NetCDF file. +4. Selects the target variable or variables. +5. Renames source latitude and longitude coordinates to `lat` and `lon`. +6. Renames the selected time coordinate to `time` when time is read from NetCDF coordinates. +7. Renames the selected level coordinate to `level` when level is read from NetCDF coordinates. +8. Adds time or level dimensions from filenames or manual tables when needed. +9. Converts 0..360 longitudes to the -180..180 convention when possible. +10. Sorts common coordinates such as `time`, `level`, `lat`, and `lon`. +11. Applies the bounding-box subset when enabled. +12. Applies the time subset in supported combine modes and calendar cases. +13. Checks grid compatibility between datasets. +14. Combines datasets by coordinates or by the selected combine dimension. +15. Applies basic CF-style metadata. +16. Writes the standardized NetCDF file. +17. Writes the manifest JSON file. 
+ +## Notes and Limitations + +- Input files must be readable by xarray using one of the supported NetCDF engines. +- The selected files should have compatible latitude and longitude grids. If grids differ, the build step can fail with a grid compatibility error. +- Bounding-box subsetting currently expects one-dimensional latitude and longitude coordinates. +- In "By time" mode, Start time and End time are disabled. Select the required files manually instead. +- For cftime calendars, preview can show the time range as strings, but time slicing may be skipped during build. +- The app writes the actual standardized level coordinate as `level`. The displayed "Output level coordinate name" setting is currently not used to rename the backend coordinate away from `level`. +- Manual time tables must contain `name` and `DateTime` columns. Manual level tables must contain `name` and `level` columns. +- When multiple target variables are selected, source variable names are preserved. +- The manifest is intended for reproducibility and troubleshooting. It should be kept together with the generated NetCDF file. +- The output file is standardized for usage in ECODATA apps but should still be checked scientifically before use in final analyses. diff --git a/docs/apps/user_guide/presence_data_preparation.md b/docs/apps/user_guide/presence_data_preparation.md new file mode 100644 index 0000000..4f76f69 --- /dev/null +++ b/docs/apps/user_guide/presence_data_preparation.md @@ -0,0 +1,416 @@ +# Presence Data Preparation + +## Overview + +### App features + +With the Presence Data Preparation App, you can + +- Prepare species occurrence or presence-type observation data for further visualization and analysis in ECODATA. +- Load observation data and, when available, associated sampling or effort data. +- Filter records by selected temporal, spatial, taxonomic, and data-quality criteria. +- Aggregate observations by user-defined time intervals. 
+- Optionally aggregate observations to a regular spatial grid. +- Calculate basic presence and observation-effort metrics. +- Export prepared tables and tracks-style files for further use in ECODATA visualization workflows. + +### Using the app + +1. If you haven't already, prepare local copies of the observation data and, if needed, the associated sampling or effort file. +2. Launch the Presence Data Preparation App. +3. Under the input file options, select the observation data file. If the workflow requires sampling or effort information, also select the corresponding sampling file. +4. Select the required filtering options, such as date range, species, review status, and spatial limits. +5. If needed, enable spatial grid aggregation and set the grid step. +6. Select the time aggregation interval. The app groups observations into fixed time windows based on the selected number of days. +7. Click the processing button to run the aggregation. +8. Review the status messages and generated output information. +9. Save the output files. The app can create aggregated count data, aggregated presence data, presence points, and tracks-style files for further ECODATA visualization. + +## Presence Data Preparation App User Manual + +This manual describes the current workflow and interface labels of the Presence Data Preparation App in Ecodata Prepare!. The app prepares animal presence or observation datasets with an eBird-compatible column structure for analysis and visualization in ECODATA-Animate. + +![View of the list of Ecodata Prepare Apps with Presence Data Preparation App.](../images/presence_apps_panel.png) + +## Purpose + +The Presence Data Preparation App is designed to prepare animal presence or observation datasets for further visualization and analysis in ECODATA-Animate. + +The app is intended for tabular datasets that follow an eBird-compatible column structure. 
The input data do not necessarily have to come directly from eBird, but they must contain the required fields used by the app, such as observation coordinates, observation date, species name, observation count, and sampling event identifier. + +The module supports: + +- Selecting large local observation and sampling-event files. +- Combining observation records with sampling-event metadata. +- Filtering observations by spatial region, time range, protocol type, checklist quality, effort, and coordinate validity. +- Aggregating observations by fixed time intervals. +- Optionally aggregating observations to a regular geographic grid. +- Calculating count, presence, sampling-support, and effort-standardized metrics. +- Exporting a presence points file for ECODATA-Animate. + +## Overview + +### Step 1. Select Input Data + +![Input section: 1. Inputs.](../images/presence_input_files.png) + +#### Observation Data: EBD file + +Select the main observation table using the file selector labelled "EBD local path". + +The file should contain animal observation records in an eBird-compatible structure. + +Required columns include: + +- sampling event identifier +- latitude +- longitude +- observation date +- scientific name +- common name +- observation count + +Optional but useful columns include: + +- time observations started +- observation type +- protocol code +- reviewed +- approved + +#### Sampling Event Data + +Select the sampling-event table using the file selector labelled "Sampling local path". + +Required column: + +- sampling event identifier + +Useful optional columns include: + +- all species reported +- protocol type +- protocol name +- duration minutes +- effort distance km +- number observers + +The app joins the observation table and sampling-event table using SAMPLING EVENT IDENTIFIER. + +### Step 2. Configure Spatial Subset + +Choose the spatial filtering mode in the "Spatial filter" control. Two modes are available: "Region polygon" and "Bounding box". 
+ +![Spatial section: 2. Spatial subset, Region polygon tab.](../images/presence_region_polygon.png) + +#### Region polygon + +Select a polygon file using the file selector labelled "Region polygon local path (shapefile or GeoJSON)". + +Supported inputs include GeoJSON/JSON files and zipped shapefiles. Only observations located inside the selected polygon are used. + +![Spatial section: 2. Spatial subset, Bounding box tab.](../images/presence_bounding_box.png) + +#### Bounding box + +Enter the following coordinates manually: + +- West / min longitude +- South / min latitude +- East / max longitude +- North / max latitude + +Coordinates must be in EPSG:4326 geographic coordinates. Longitude must be between -180 and 180, latitude must be between -90 and 90, west must be smaller than east, and south must be smaller than north. + +### Step 3. Define Output Settings + +![Output section: 3. Outputs.](../images/presence_output_settings.png) + +Use "Output folder" to define where the generated files will be saved. By default, the app uses the Windows user Downloads folder, for example `C:\Users\<username>\Downloads`. + +Use "Run name" to define the prefix for generated output files. The default run name is: + +- `presence_run` + +For this default run name, the app creates files such as: + +- `presence_run__agg_counts.csv` +- `presence_run__agg_presence.csv` +- `presence_run__manifest.json` +- `presence_run__presence_points.csv` + +### Step 4. Configure Vetting / Filtering + +![Vetting section: 4. Vetting / filtering.](../images/presence_vetting_filters.png) + +The app provides several optional filters to remove unsuitable or low-quality records before aggregation. + +Checklist and review filters + +- REVIEWED: if enabled, only records marked as reviewed are kept when this column is available. +- APPROVED: if enabled, only records marked as approved are kept when this column is available. 
+- ALL SPECIES REPORTED: if enabled, only complete checklists are kept when this column is available. +- Exclude incidental/historical: if enabled, incidental and historical protocols are removed when protocol information is available. +- Require valid coordinates: removes records with missing or invalid latitude/longitude values. + +Protocol and count filters + +- Allowed protocols (optional): selects which protocol types should be included. Typical options are Traveling, Stationary, Area, Incidental, and Historical. +- Clip extreme counts above (0=off): optionally limits unusually high observation counts. A value of 0 disables clipping. + +Effort filters + +- Min duration (minutes) and Max duration (minutes): filter records based on DURATION MINUTES when available. +- Min distance (km) and Max distance (km): filter records based on EFFORT DISTANCE KM when available. + +### Step 5. Configure Time and Spatial Aggregation + +![Time and spatial aggregation section: 5. Time and spatial aggregation.](../images/presence_time_spatial_aggregation.png) + +- Start date: first date included in processing. +- End date: last date included in processing. +- Aggregation step (days): size of the temporal aggregation window in days. +- Grid step (degrees, 0 = use original coordinates): spatial aggregation setting. + +Examples of aggregation step values: + +- 1 = daily aggregation. +- 7 = fixed 7-day aggregation. +- 30 = fixed 30-day aggregation. + +If Grid step is 0, the app keeps original observation coordinates. If Grid step is greater than 0, observations are assigned to regular longitude/latitude grid nodes and aggregated by grid node. + +### Step 6. Configure Derived-Metric Filters + +![Derived-metric section: 6. Derived-metric filters.](../images/presence_derived_metric_filters.png) + +After the main aggregation is completed, the app can filter the aggregated counts file using derived metrics. 
+ +- Min frequency of detection (reporting_rate): keeps only records where the detection frequency is above the selected threshold. +- Min effort-standardized count: keeps only records where `count_per_complete_checklist` is above the selected threshold. +- Min sampling support (`n_complete_checklists`): keeps only records with enough complete checklists. + +These filters are applied after aggregation and before the species list is updated. + +### Step 7. Run Actions + +![Action section: 7. Actions.](../images/presence_actions.png) + +#### Aggregate + +Press "Aggregate" to start processing. + +The app performs the following steps: + +1. Reads the observation and sampling-event tables from local file paths. +2. Checks that required columns are present. +3. Merges observation records with sampling-event metadata using SAMPLING EVENT IDENTIFIER. +4. Parses timestamps from observation date and optional observation time. +5. Converts latitude and longitude to numeric coordinates. +6. Applies vetting and effort filters. +7. Applies spatial filtering using a polygon or bounding box. +8. Applies time filtering. +9. Assigns observations to fixed time bins. +10. Optionally assigns observations to regular grid nodes. +11. Groups observations by time bin, location, and species. +12. Calculates count, presence, sampling-support, and effort-standardised metrics. +13. Saves aggregated output files and a manifest file. + +#### Export file for ECODATA-Animate + +After aggregation, optionally select one or more species in "Species in results". If no species are selected, all species in the aggregated counts file are exported. + +The "Output filename" field is displayed in the interface with the default value `presence_points.csv`. 
In the current implementation, the actual generated file name is based on the run name, and the file is saved as: + +- `<run_name>__presence_points.csv` + +Press "Export file for ECODATA-Animate" to convert the aggregated counts table into a file that can be used in ECODATA-Animate (the version with the "Presence visualization options" tab). + +## Output Files + +The app creates three main CSV outputs and one manifest JSON file. With the default run name `presence_run`, the files are named `presence_run__agg_counts.csv`, `presence_run__agg_presence.csv`, `presence_run__manifest.json`, and `presence_run__presence_points.csv`. + +| Output file | Created when | Main purpose | +| --- | --- | --- | +| agg_counts CSV | After pressing "Aggregate" | Quantitative aggregation by time bin, location, and species. | +| agg_presence CSV | After pressing "Aggregate" | Presence aggregation by time bin, location, and species. | +| manifest JSON | After pressing "Aggregate" | Processing metadata and selected settings. | +| presence_points CSV | After pressing "Export file for ECODATA-Animate" | Animate-ready point file generated from agg_counts. | + +### Aggregated Counts File + +Default name: `presence_run__agg_counts.csv` + +It contains aggregated count and effort-standardized metrics for each species, time bin, and spatial unit. + +Typical columns include: + +- `time_bin_start`, `time_bin_end` +- `location-lat`, `location-long` +- `species` +- `total_count` +- `n_checklists`, `n_checklists_all`, `n_complete_checklists` +- `n_detected_complete_checklists` +- `sum_duration_hours_complete`, `sum_party_hours_complete` +- `reporting_rate` +- `count_per_complete_checklist` +- `count_per_hour` +- `count_per_party_hour_complete` +- `mean_count_when_detected` +- `region_id` + +### Aggregated Presence File + +Default name: `presence_run__agg_presence.csv` + +It indicates whether a species was detected in a given time bin and spatial unit. 
+ +Typical columns include: + +- `time_bin_start`, `time_bin_end` +- `location-lat`, `location-long` +- `species` +- `presence` +- `n_checklists`, `n_checklists_all`, `n_complete_checklists` +- `n_detected_complete_checklists` +- `reporting_rate` +- `region_id` + +Note: Rows are created where a species was detected. The file does not automatically generate absence rows with presence = 0 for all possible species-location-time combinations. + +### Presence Points File for ECODATA-Animate + +Default name: `presence_run__presence_points.csv` + +This file is generated from the aggregated counts file. It reformats the aggregated output into a structure that can be used by ECODATA-Animate. + +Typical columns include: + +- `timestamp` +- `location-long`, `location-lat` +- `individual-local-identifier` +- `species` +- `count` +- `bin_id` +- `region_id` +- `total_count` +- `n_checklists`, `n_checklists_all`, `n_complete_checklists` +- `n_detected_complete_checklists` +- `sum_duration_hours_complete`, `sum_party_hours_complete` +- `reporting_rate` +- `count_per_complete_checklist` +- `count_per_hour` +- `count_per_party_hour_complete` +- `mean_count_when_detected` + +## Output Scenarios + +Scenario 1. Region polygon selected + +- Observations are filtered to the selected polygon. +- `region_id` is generated from the polygon filename. +- Output files represent only the selected region. + +Scenario 2. Bounding box selected + +- Observations are filtered to the selected rectangular extent. +- `region_id` is generated from the bounding-box coordinates. +- Output files represent the selected coordinate range. + +Scenario 3. Grid step = 0 + +- Original observation coordinates are preserved. +- Aggregation is performed by original location. +- This is useful when exact observation coordinates should be retained. + +Scenario 4. Grid step > 0 + +- Observations are assigned to regular grid nodes. +- Aggregation is performed by time bin, grid node, and species. 
+- This is useful for reducing spatial noise and preparing smoother maps. + +Scenario 5. Derived-metric filters enabled + +- Aggregated records are filtered after calculation. +- Records with low values of `reporting_rate`, effort-standardized count, or `n_complete_checklists` can be removed. + +## Expected Results + +After successful processing, you will obtain: + +1. An aggregated counts file ending with `__agg_counts.csv`. +2. An aggregated presence file ending with `__agg_presence.csv`. +3. A manifest file ending with `__manifest.json`. +4. Optionally, an Animate-ready presence points file ending with `__presence_points.csv`. + +The outputs can be used for visualising temporal changes in ECODATA-Animate (the version with the "Presence visualization options" tab). + +## Formulas of Output Metrics + +The metrics are calculated for each aggregation group: species + time bin + location. + +The location can be either the original observation coordinates or a grid node if grid aggregation is enabled. + +a) `total_count` - Sum of all parsed OBSERVATION COUNT values for a given species, time bin, and location. If OBSERVATION COUNT = X, it is treated as 1. + +`total_count = sum of observation counts` + +b) `n_checklists` - Counts unique SAMPLING EVENT IDENTIFIER values where the species was present. + +`n_checklists = number of unique checklists where the species was detected` + +c) `n_checklists_all` - Counts all unique checklists in the same time bin and location, after filtering, regardless of species. + +`n_checklists_all = total number of unique checklists in the time-space unit` + +d) `n_complete_checklists` - Counts checklists where ALL SPECIES REPORTED is interpreted as true. + +`n_complete_checklists = number of complete checklists` + +e) `n_detected_complete_checklists` - Counts complete checklists in which the species was recorded. 
+ +`n_detected_complete_checklists = number of complete checklists where the species was detected` + +f) `sum_duration_hours_complete` - total duration of complete checklists, in hours. Uses only complete checklists. + +`duration_hours = duration minutes / 60` + +`sum_duration_hours_complete = sum(duration_hours for complete checklists)` + +g) `sum_party_hours_complete` - total observer-effort time for complete checklists. + +`party_hours = duration_hours * number observers` + +`sum_party_hours_complete = sum(party_hours for complete checklists)` + +h) `reporting_rate` - frequency of detection, shows how often the species was detected in complete checklists. + +`reporting_rate = n_detected_complete_checklists / n_complete_checklists` + +i) `count_per_complete_checklist` - effort-standardised count per complete checklist, shows the average count relative to the number of complete checklists. + +`count_per_complete_checklist = total_count / n_complete_checklists` + +j) `count_per_hour` - count standardised by checklist duration, shows the number of observed individuals per hour of complete-checklist effort. + +`count_per_hour = total_count / sum_duration_hours_complete` + +k) `count_per_party_hour_complete` - count standardised by observer effort, accounts for both observation duration and number of observers. + +`count_per_party_hour_complete = total_count / sum_party_hours_complete` + +l) `mean_count_when_detected` - mean count per checklist where the species was detected, shows the average count only among checklists where the species was recorded. + +`mean_count_when_detected = total_count / n_checklists` + +## Notes and Limitations + +- Input tables must follow an eBird-compatible column structure. +- Data do not have to come directly from eBird, but required columns must be present. +- The app currently creates presence-only rows for detected species. It does not generate a complete absence matrix. 
+- If no observations remain after filtering by date, region, protocol, or effort, the output may be empty. +- If coordinates cannot be parsed as numeric values, those records are removed when "Require valid coordinates" is enabled. +- If ALL SPECIES REPORTED is missing or not marked as true, complete-checklist-based metrics may be zero or unavailable. +- `reporting_rate` depends on the availability and quality of complete checklist information. +- `count_per_hour` and `count_per_party_hour_complete` depend on duration and observer-count fields. +- The presence points file is designed for visualization and should not be treated as raw observation data. diff --git a/docs/conf.py b/docs/conf.py index 0aeb6a9..b028c26 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -167,11 +167,11 @@ def _mk_link(asset): | OS | Architecture | Download | |----------------|-----------------------|----------| | Windows | x86_64 | {_mk_link(rel['assets'].get('win_x86_64'))} | -| macOS latest | arm64 (Apple Silicon) | {_mk_link(rel['assets'].get('mac_arm64'))} | -| `(*)` macOS 13 | x86_64 | {_mk_link(rel['assets'].get('mac_x86_64'))} | +| macOS | arm64 (Apple Silicon) | {_mk_link(rel['assets'].get('mac_arm64'))} | +| macOS | x86_64 (Intel) | {_mk_link(rel['assets'].get('mac_x86_64'))} | | Linux | x86_64 | {_mk_link(rel['assets'].get('linux_x86_64'))} | -`(*)` Use this installer for all Macs that are **not** an Apple Silicon (M-series) Mac running macOS 15 (i.e., use for all x86 (Intel chip) Macs, or any Mac running macOS 13). +Use the Apple Silicon installer for M-series Macs and the Intel installer for x86_64 Macs. """ diff --git a/docs/package/developing.rst b/docs/package/developing.rst index c107691..a68634c 100644 --- a/docs/package/developing.rst +++ b/docs/package/developing.rst @@ -8,6 +8,7 @@ Installation options mamba env create -n eco-dev --file ecodata-env.yml mamba activate eco-dev mamba env update -n eco-dev -f ecodata-dev-env.yml + python -m pip install --no-deps -e . 
* To do a full install (not editable): @@ -55,4 +56,4 @@ In the ecodata repo actions tab, click on the conda_constructor CI tab on the le .. _`Conda | example workflow for updating a package`: https://conda-forge.org/docs/maintainer/updating_pkgs.html#example-workflow-for-updating-a-package -.. _`ecodata feedstock`: https://github.com/conda-forge/ecodata-feedstock \ No newline at end of file +.. _`ecodata feedstock`: https://github.com/conda-forge/ecodata-feedstock diff --git a/ecodata-dev-env.yml b/ecodata-dev-env.yml index 6066158..e9f8504 100644 --- a/ecodata-dev-env.yml +++ b/ecodata-dev-env.yml @@ -9,6 +9,7 @@ dependencies: - conda-forge::jupyterlab - pytest - pip +- geographiclib - pip: - black - isort @@ -24,5 +25,4 @@ dependencies: - click - build # - panel-jstree # panel needs to updated to >1 first - - -e . --global-option="--no-deps" name: eco-dev diff --git a/ecodata-env.yml b/ecodata-env.yml index 61fbcc8..ddb39d7 100644 --- a/ecodata-env.yml +++ b/ecodata-env.yml @@ -25,3 +25,7 @@ dependencies: - ffmpeg - fiona - gdown<4.6 # gdown 4.6.something has a problem with our gdrive files +- distributed +- geographiclib +- h5netcdf +- pyproj \ No newline at end of file diff --git a/ecodata/__init__.py b/ecodata/__init__.py index fb95852..c86eeb0 100644 --- a/ecodata/__init__.py +++ b/ecodata/__init__.py @@ -46,3 +46,28 @@ select_time_range, # noqa thin_dataset, # noqa ) +from ecodata.movebank_functions import( + process_csv_interp_or_averaging, # noqa + validate_and_process_csv, + merge_csv_files_from_folder, + generate_individual_csvs_for_local_ids, + delete_files +) +from ecodata.annotation_eng_func import( + load_vector_extent_info, + load_taxa_and_ids_from_csv, + convert_tif_to_nc_before_annotation, + get_nc_bounds, + safe_open_nc_with_time_decoding +) +from ecodata.multidim_annotation_func import( + sample_era5_at_height, +) + +from ecodata.presence_functions import ( + VettingOptions, + AggregationOptions, + aggregate_ebird_to_files, + 
export_tracks_from_aggregated_counts, + read_species_from_agg_counts, +) \ No newline at end of file diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py new file mode 100644 index 0000000..f530132 --- /dev/null +++ b/ecodata/annotation_eng_func.py @@ -0,0 +1,1718 @@ +import xarray as xr +import geopandas as gpd +from pathlib import Path +import gc +import time +import pandas as pd +import re +from shapely.geometry import Point +import numpy as np +from datetime import datetime +import rasterio +from pyproj import CRS, Transformer + +LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") + + +def open_nc_metadata(path: str) -> xr.Dataset: + """ + Open a NetCDF dataset for metadata inspection only. + + This avoids time decoding so the UI can list variables and coordinate + candidates even when the time coordinate needs to be selected manually. + """ + return xr.open_dataset(path, decode_times=False, chunks="auto") + + +def detect_env_coord_names(ds: xr.Dataset) -> dict: + """ + Detect likely coordinate names for an environmental dataset. + + Returns keys: env_time, env_x, env_y, env_lat, env_lon. + Values may be None when not detected. 
def safe_open_nc_with_time_decoding(path, time_name: str | None = None):
    """
    Open a NetCDF file with time decoding done manually, so that non-standard
    calendars (julian, gregorian, 360_day, noleap, ...) are supported.

    The dataset is opened with ``decode_times=False``; the time variable is then
    decoded from its ``units``/``calendar`` attributes and re-attached as a
    coordinate named ``time`` holding a ``pd.DatetimeIndex`` (cftime objects are
    converted through their string representation).

    Parameters
    ----------
    path : str or path-like
        NetCDF file to open.
    time_name : str, optional
        Name of the time coordinate/variable. When None it is auto-detected
        with ``_detect_time_name``.

    Returns
    -------
    xr.Dataset
        Dataset whose time axis is named ``time``. The caller is responsible
        for closing it.

    Raises
    ------
    RuntimeError
        If the file cannot be opened, no time-like variable is found, or the
        time values cannot be decoded. The original exception is chained.
    """
    opened = None  # handle of the file as first opened; closed on failure
    try:
        opened = xr.open_dataset(path, decode_times=False, chunks="auto")
        ds = opened

        if time_name is None:
            time_name = _detect_time_name(ds)

        if time_name is None:
            raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').")

        # if time is in variables but not in coords — make it a coordinate
        if time_name in ds.variables and time_name not in ds.coords:
            ds = ds.set_coords(time_name)

        time_var = ds[time_name]
        units = str(time_var.attrs.get("units", ""))
        calendar = str(time_var.attrs.get("calendar", "standard")).lower()

        if "since" not in units:
            # sometimes there are "epoch seconds" without 'since'
            # add default: seconds since 1970-01-01
            if units.strip() == "" and pd.api.types.is_integer_dtype(time_var.dtype):
                units = "seconds since 1970-01-01"
                calendar = "proleptic_gregorian"

        decoded = np.asarray(
            xr.coding.times.decode_cf_datetime(time_var.values, units, calendar)
        )
        # if these are cftime objects — convert via str.
        # The size check keeps an empty time axis from raising IndexError.
        is_cftime = decoded.size > 0 and hasattr(decoded.flat[0], "strftime")
        if is_cftime:
            decoded = pd.to_datetime([str(d) for d in decoded])
        else:
            decoded = pd.to_datetime(decoded)

        # rename the time coordinate to the unified 'time'
        if time_name != "time":
            ds = ds.assign_coords({time_name: decoded}).rename({time_name: "time"})
        else:
            ds = ds.assign_coords(time=decoded)

        return ds

    except Exception as e:
        # Do not leak the open file handle when detection/decoding fails.
        if opened is not None:
            try:
                opened.close()
            except Exception:
                # A failure while closing must not mask the original error.
                pass
        raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") from e
def get_nc_bounds(nc_path: str, env_coord_names: dict | None = None):
    """
    Return the latitude/longitude extent of a NetCDF file.

    Parameters
    ----------
    nc_path : str
        Path to the NetCDF file.
    env_coord_names : dict, optional
        May provide ``env_time``, ``env_lat`` and ``env_lon``. Any of the
        lat/lon names that is missing or empty is auto-detected on its own,
        so a name supplied for one axis is kept even if the other is not given.

    Returns
    -------
    dict
        ``{"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max}`` as floats,
        read directly from the lat/lon variables (no reprojection is done here;
        presumably the values are WGS84 degrees — confirm for projected grids).

    Raises
    ------
    ValueError
        If a latitude or longitude variable cannot be determined.
    """
    env_coord_names = env_coord_names or {}
    ds = safe_open_nc_with_time_decoding(nc_path, time_name=env_coord_names.get("env_time"))
    try:
        lat_name = env_coord_names.get("env_lat")
        lon_name = env_coord_names.get("env_lon")

        # Fall back to auto-detection per axis, so a user-selected name on one
        # axis is not thrown away just because the other axis was left unset.
        if not lat_name:
            lat_candidates = ("lat", "latitude", "Latitude")
            lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None)
        if not lon_name:
            lon_candidates = ("lon", "longitude", "Longitude", "long")
            lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None)

        if lat_name is None or lon_name is None:
            raise ValueError("Could not detect lat/lon coordinate names in NetCDF")

        lat_values = ds[lat_name]
        lon_values = ds[lon_name]
        return {
            "S": float(lat_values.min()),
            "N": float(lat_values.max()),
            "W": float(lon_values.min()),
            "E": float(lon_values.max()),
        }
    finally:
        # Always release the file handle, including on the error paths above.
        ds.close()
def load_taxa_and_ids_from_csv(file_path):
    """
    Read a Movebank-style CSV and list the individuals and taxa it contains.

    Column headers are matched case-insensitively, with runs of ``-``, ``.``,
    ``_`` and whitespace treated as a single underscore, so for example
    ``individual-local-identifier`` and ``individual.local.identifier`` both match.

    Returns
    -------
    tuple
        ``(df, unique_taxa, unique_ids, error)``:
        - on success: the DataFrame, sorted unique taxon names (empty list when
          the taxon column is absent), sorted unique individual IDs, and ``None``;
        - on failure: ``(None, [], [], message)``, where ``message`` describes the
          problem (missing ID column, or the text of the raised exception).
    """
    try:
        table = pd.read_csv(file_path)

        # normalised header -> original header
        header_lookup = {}
        for original in table.columns:
            header_lookup[re.sub(r"[-._\s]+", "_", original.lower())] = original

        id_col = header_lookup.get("individual_local_identifier")
        taxon_col = header_lookup.get("individual_taxon_canonical_name")

        if id_col is None:
            return None, [], [], "No column found for individual-local-identifier"

        def _sorted_unique(column_name):
            # unique non-null values as strings, in sorted order
            return sorted(table[column_name].dropna().astype(str).unique())

        ids = _sorted_unique(id_col)
        taxa = _sorted_unique(taxon_col) if taxon_col else []
        return table, taxa, ids, None

    except Exception as exc:
        return None, [], [], str(exc)
path to .shp or .geojson + """ + print("[DEBUG] Annotation started") + print("Selected variables:", selected_env_vars) + print("From files:", [env_var_map.get(v) for v in selected_env_vars]) + print("Selected IDs:", selected_ids) + print("Movebank file:", movebank_path) + print("Boundary file:", boundary_path) + print("Interpolation method:", interpolation_method) + env_coord_names = env_coord_names or {} + + # bridge from the current coord_spec logic to the #191 commit's env_coord_names naming. + + if not env_coord_names and coord_spec: + env_coord_names = { + "env_time": coord_spec.get("time"), + "env_lat": coord_spec.get("lat"), + "env_lon": coord_spec.get("lon"), + "env_x": None, + "env_y": None, + } + + # === Step 1: Spatial filtering === + df_filtered, trimmed_path = filter_points_within_boundary( + movebank_path, selected_ids, boundary_path, bbox=bbox + ) + + if df_filtered.empty: + print("[WARNING] No points within the boundary.") + remove_temporary_trimmed_file(trimmed_path) + return + + # ===*** Time prefiltering (union across selected variables) === + nc_start, nc_end = get_nc_timerange_for_selected( + env_var_map, + selected_env_vars, + time_name=env_coord_names.get("env_time"), + ) + df_filtered = filter_points_within_timerange(df_filtered, nc_start, nc_end) + if df_filtered.empty: + print("[WARNING] No points within the NC time window after prefiltering.") + remove_temporary_trimmed_file(trimmed_path) + return + # ===*** + + # === Step 2: Loading and interpolation of environmental data === + result = load_selected_environmental_data( + df_filtered, + env_var_map, + selected_env_vars, + movebank_path, + interpolation_method, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars, + ) + if result is None: + print("[ERROR] Environmental data was not loaded.") + remove_temporary_trimmed_file(trimmed_path) + return + + df_annotated, ann_nc_start, 
ann_nc_end = result + # Optional post-sampling value correction + # Apply only to continuous variables after sampling/interpolation. + # This is methodologically safe for linear scale/offset: + # physical_value = raw_value * scale_factor + add_offset. + # Categorical/QC variables must remain as raw category/flag codes. + if apply_value_correction: + if value_correction_vars is None: + correction_vars = list(continuous_vars or []) + else: + correction_vars = list(value_correction_vars or []) + + try: + scale = float(value_scale_factor) + offset = float(value_add_offset) + except Exception as e: + raise ValueError(f"Invalid scale factor / offset: {e}") + + for v in correction_vars: + if v not in df_annotated.columns: + print(f"[WARNING] Scale/offset skipped for '{v}': column not found.") + continue + + # Convert only the annotated continuous column. + # Non-numeric values become NaN, which is acceptable for continuous variables. + df_annotated[v] = pd.to_numeric(df_annotated[v], errors="coerce") * scale + offset + + print( + "[INFO] Applied post-sampling scale/offset to continuous variables: " + f"{correction_vars}; scale={scale}, offset={offset}" + ) + # Keep the real union NC range computed before annotation, + # unless an annotator explicitly returns a valid range in the future. 
+ if not pd.isna(ann_nc_start): + nc_start = ann_nc_start + if not pd.isna(ann_nc_end): + nc_end = ann_nc_end + +#### diagnostic + var = selected_env_vars[0] if selected_env_vars else None + if var in df_annotated.columns: + in_nc = df_annotated["timestamp"].between( + pd.to_datetime(df_annotated["timestamp"]).min() if pd.isna(nc_start) else nc_start, + pd.to_datetime(df_annotated["timestamp"]).max() if pd.isna(nc_end) else nc_end + ) + filled_total = df_annotated[var].notna().sum() + filled_in_nc = df_annotated.loc[in_nc, var].notna().sum() + print(f"[DEBUG] Filled '{var}': total={filled_total}, within-NC-window={filled_in_nc}") + else: + print(f"[WARNING] Column '{var}' not found in annotated DataFrame.") + + # === Step 3: Time filtering === + df_time_filtered = df_annotated.copy() + print("[INFO] Full timestamp range preserved. Outside-NC values will be NaN.") + + # === Step 4: Saving the final result === + if out_csv_path: + out_path = Path(out_csv_path) + else: + out_path = Path(movebank_path).parent / "annotated_env.csv" + df_time_filtered = df_time_filtered.drop(columns=["geometry", "nc_lat", "nc_lon", "x", "y"], errors="ignore") + df_time_filtered.to_csv(out_path, index=False, encoding="utf-8-sig", date_format="%Y-%m-%d %H:%M:%S") + print(f"[INFO] Final filtered annotation saved to {out_path}") + + # === Step 5: Saving by individual ID === + output_folder = out_path.parent / "annotated_individuals" + output_folder.mkdir(parents=True, exist_ok=True) + + id_col = "individual_local_identifier" + if id_col in df_time_filtered.columns: + unique_ids = df_time_filtered[id_col].dropna().unique() + for uid in unique_ids: + df_id = df_time_filtered[df_time_filtered[id_col] == uid] + safe_uid = re.sub(r"[^\w\-]", "_", str(uid)) + out_file = output_folder / f"annotated_env_{safe_uid}.csv" + df_id.to_csv(out_file, index=False) + print(f"[INFO] Saved {len(unique_ids)} individual files to {output_folder}") + else: + print("[WARNING] Column 'individual_local_identifier' 
not found. Skipping per-ID export.") + remove_temporary_trimmed_file(trimmed_path) + + +def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=None, bbox=None): + print("[DEBUG] Filtering is started") + df = pd.read_csv(movebank_path) + df.columns = [re.sub(r"[-:.\s]+", "_", col.lower()) for col in df.columns] + # --- unify longitude column to location_lon --- + if "location_lon" in df.columns and "location_long" in df.columns: + # both exist -> keep location_lon (canonical), drop location_long + df = df.drop(columns=["location_long"]) + elif "location_lon" not in df.columns and "location_long" in df.columns: + # only location_long -> rename to canonical location_lon + df = df.rename(columns={"location_long": "location_lon"}) + if "timestamp" not in df.columns and "eobs_start_timestamp" in df.columns: + df["timestamp"] = df["eobs_start_timestamp"] + + required_cols = {"location_lat", "location_lon", "individual_local_identifier", "timestamp"} + if not required_cols.issubset(df.columns): + raise ValueError(f"Required columns are missing in Movebank file. Missing: {required_cols - set(df.columns)}") + + # ID-filter + df = df[df["individual_local_identifier"].isin(selected_ids)] + df = interpolate_missing_coordinates(df) + + output_path = Path(movebank_path).parent / "trimmed.csv" + if bbox is not None: + S, N, W, E = map(float, (bbox["S"], bbox["N"], bbox["W"], bbox["E"])) + m = df["location_lat"].between(S, N) & df["location_lon"].between(W, E) + df = df.loc[m].copy() + df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] + gdf_filtered = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + + try: + if gdf_filtered.empty: + print("[INFO] No points within bbox. 
File not saved.") + else: + gdf_filtered.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) + print(f"[INFO] (bbox) Data saved to {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save (bbox) data: {e}") + return gdf_filtered, output_path + + # case: boundary from shp/geojson + df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] + gdf_points = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + + if boundary_path is None: + print("[INFO] No boundary provided. Skipping spatial clipping (all selected IDs kept).") + try: + gdf_points.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) + print(f"[INFO] (No-boundary) Data saved to {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save (no-boundary) data: {e}") + return gdf_points, output_path + + gdf_boundary = gpd.read_file(boundary_path) + if gdf_boundary.crs != gdf_points.crs: + gdf_boundary = gdf_boundary.to_crs(gdf_points.crs) + + gdf_filtered = gpd.sjoin(gdf_points, gdf_boundary[["geometry"]], predicate="within", how="inner").drop(columns="index_right") + + try: + if gdf_filtered.empty: + print("[INFO] No points within boundary. File not saved.") + else: + gdf_filtered.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) + print(f"[INFO] Filtered data saved to {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save filtered data: {e}") + + return gdf_filtered, output_path + + +def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_end: pd.Timestamp) -> pd.DataFrame: + df = df.copy() + if nc_start is None or nc_end is None: + print("[INFO] NC union time range unavailable. 
def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing 'location_lat' / 'location_lon' values by time-weighted
    interpolation.

    Rows whose timestamp cannot be parsed are removed first. When the table has
    an 'individual_local_identifier' column, gaps are interpolated separately
    for each individual, so a missing fix of one animal is never filled from the
    positions of another animal that happens to be adjacent in time. Without
    that column the whole table is treated as a single track.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'location_lat' and 'location_lon'.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by timestamp with a fresh integer index; 'timestamp' is
        the first column, and the coordinate columns are numeric.

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    coord_cols = ["location_lat", "location_lon"]
    id_col = "individual_local_identifier"

    required_cols = {"timestamp", "location_lat", "location_lon"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"DataFrame must contain columns: {required_cols}")

    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce")

    n_missing = df["timestamp"].isna().sum()
    if n_missing > 0:
        print(f"[INFO] {n_missing} rows with missing or invalid timestamps were removed before interpolation.")

    # method="time" needs a DatetimeIndex without NaT, in chronological order.
    df = df.dropna(subset=["timestamp"]).sort_values("timestamp").set_index("timestamp")

    for coord in coord_cols:
        df[coord] = pd.to_numeric(df[coord], errors="coerce")

    # Row positions of each track. Positions (not labels) are used because the
    # timestamp index is usually not unique when several individuals are present.
    if id_col in df.columns:
        codes, _ = pd.factorize(df[id_col])  # missing IDs share code -1 -> one group
        tracks = list(pd.Series(np.arange(len(df))).groupby(codes).indices.values())
    else:
        tracks = [np.arange(len(df))]

    coord_pos = [df.columns.get_loc(c) for c in coord_cols]
    for rows in tracks:
        if len(rows) == 0:
            continue
        filled = df.iloc[rows, coord_pos].interpolate(method="time", limit_direction="both")
        df.iloc[rows, coord_pos] = filled.to_numpy()

    return df.reset_index()
+ + Current behaviour: + - Continuous + Nearest neighbour: + nearest spatial grid node + linear temporal interpolation + - Continuous + IDW: + k nearest spatial grid nodes + linear temporal interpolation per node + IDW + - Categorical/QC + Nearest neighbour: + nearest spatial grid node + nearest timestep + - Categorical/QC + IDW selected: + categorical/QC variables are not IDW-averaged; + they use nearest spatial grid node + nearest timestep + - Continuous + Bilinear projected x/y: + bilinear interpolation on a projected 1D x/y grid + linear temporal interpolation + - Categorical/QC + Bilinear projected x/y: + not allowed, because bilinear interpolation is not valid for class/flag codes + """ + label = (interpolation_method or "").strip().lower() + label = label.replace("neighbor", "neighbour") # Normalise US/UK spelling + + is_nearest = label.startswith("nearest") + is_idw = ("idw" in label) or ("inverse distance" in label) + + # Normalize interpolation method + method = (interpolation_method or "").lower() + is_nearest = ("nearest" in method) + is_idw = ("idw" in method) + is_bilinear = "bilinear" in method + + # If split lists are not provided, treat everything as "selected_vars" + cont = list(continuous_vars or []) + cat = list(categorical_vars or []) + + if not cont and not cat: + # everything in selected_vars, method applies to all + if is_nearest: + return annotate_env_nearest( + df, env_var_map, selected_vars, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + ) + + if is_idw: + return annotate_env_IDW( + df, env_var_map, selected_vars, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + ) + + if is_bilinear: + return annotate_env_bilinear_projected( + df, + env_var_map, + selected_vars, + movebank_path, + env_coord_names=env_coord_names, + ) + + raise ValueError(f"Unknown interpolation method: {interpolation_method}") + + # If split lists are provided: 
+ # 1) Nearest selected: + # continuous -> nearest grid node + linear time interpolation + # categorical/QC -> nearest grid node + nearest timestep + if is_nearest: + out_df = df + nc_start = pd.NaT + nc_end = pd.NaT + + # Continuous: nearest grid node + linear time interpolation + if cont: + out_df, nc_start, nc_end = annotate_env_nearest( + out_df, env_var_map, cont, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + temporal_method="linear" + ) + + # Categorical/QC: nearest grid node + nearest timestep + if cat: + out_df, nc_start2, nc_end2 = annotate_env_nearest( + out_df, env_var_map, cat, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + temporal_method="nearest" + ) + + if pd.isna(nc_start) and not pd.isna(nc_start2): + nc_start = nc_start2 + if pd.isna(nc_end) and not pd.isna(nc_end2): + nc_end = nc_end2 + + return out_df, nc_start, nc_end + + # 2) IDW selected -> cont=IDW, cat=NN + if is_idw: + out_df = df + nc_start = pd.NaT + nc_end = pd.NaT + + # continuous via IDW + if cont: + out_df, nc_start, nc_end = annotate_env_IDW( + out_df, env_var_map, cont, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + temporal_method="linear" + ) + + # categorical via Nearest neighbour in space + nearest timestep in time + if cat: + out_df, nc_start2, nc_end2 = annotate_env_nearest( + out_df, env_var_map, cat, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + temporal_method="nearest" + ) + # keep nc_start/nc_end stable (both annotators return NaT) + if pd.isna(nc_start) and not pd.isna(nc_start2): + nc_start = nc_start2 + if pd.isna(nc_end) and not pd.isna(nc_end2): + nc_end = nc_end2 + + return out_df, nc_start, nc_end + + # 3) Bilinear projected selected: + # continuous -> bilinear projected x/y + linear time + # categorical/QC -> not allowed + if 
def standardize_time_lat_lon(ds, coord_spec):
    """
    Rename user-selected coordinate variables to the canonical names
    'time', 'lat' and 'lon', then verify that all three exist.

    ``coord_spec`` maps a canonical name to the name chosen by the user
    (e.g. ``{"time": "valid_time", "lat": "latitude"}``); it may be None or
    empty. A chosen name is renamed only if it is present in ``ds.variables``
    and differs from the canonical name.

    Returns the (possibly renamed) dataset. Raises ValueError naming the first
    canonical variable that is still missing after renaming.
    """
    canonical = ("time", "lat", "lon")

    # chosen name -> canonical name, for every selection that needs renaming
    renames = {}
    for target in (canonical if coord_spec else ()):
        picked = coord_spec.get(target)
        if not picked or picked == target:
            continue
        if picked in ds.variables:
            renames[picked] = target

    if renames:
        ds = ds.rename(renames)

    absent = next((name for name in canonical if name not in ds.variables), None)
    if absent is not None:
        req = absent
        raise ValueError(
            f"Missing required '{req}' variable after user selection. "
            f"Selected: {coord_spec}. Available: {list(ds.variables.keys())}"
        )
    return ds
+ + Parameters + ---------- + df : pandas.DataFrame + Movebank-like table with columns: timestamp, location_lat, location_lon, etc. + env_var_map : dict[str, str] + Mapping from UI label to NetCDF path, e.g. {"v_1000": "/path/file.nc"}. + selected_vars : list[str] + Labels picked in the UI; labels may be plain vars ("t2m") or var+level ("v_850"). + movebank_path : str + Used only for output file placement upstream in the pipeline. + smoothing_k : int + Unused in the nearest-neighbour branch (kept for signature symmetry). + + Returns + ------- + (out_df, nc_start, nc_end) + `out_df` includes new columns for each selected label; nc_* are placeholders here. + + Notes + ----- + - Assumes `safe_open_nc_with_time_decoding` and `_ensure_sorted` are available in scope. + - Column names in the result exactly match `selected_vars` (e.g. "v_1000"). + """ + def _nearest_indices_vectorized(arr, vals): + """ + Fast nearest-index for a (monotonic) 1D array `arr` + against multiple query values `vals` (vectorised). + """ + idx = np.searchsorted(arr, vals) + idx = np.clip(idx, 0, len(arr) - 1) + left = np.maximum(idx - 1, 0) + take_left = (idx > 0) & (np.abs(arr[left] - vals) <= np.abs(arr[idx] - vals)) + return np.where(take_left, left, idx) + + # --- input prep ----------------------------------------------------------- + out = df.copy() + out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") + out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) + + temporal_method = (temporal_method or "linear").strip().lower() + if temporal_method not in ("linear", "nearest"): + temporal_method = "linear" + env_coord_names = env_coord_names or {} + + # Placeholders for nearest grid coords (one set; overwritten by last variable) + nc_latitudes = np.full(len(out), np.nan, dtype="float64") + nc_longitudes = np.full(len(out), np.nan, dtype="float64") + + # Target times as int64 ns, used for either np.interp or nearest-time lookup. 
+ tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") + + # --- main loop over requested labels ------------------------------------- + for label in selected_vars: + file_path = env_var_map.get(label) + if temporal_method == "nearest": + # Categorical/QC-safe column: preserve integer codes or labels if present. + out[label] = pd.Series([pd.NA] * len(out), index=out.index, dtype="object") + else: + out[label] = np.nan # continuous numeric column + if not file_path or not Path(file_path).is_file(): + print(f"[WARNING] File for {label} not found: {file_path}") + continue + + # Split the UI label into (base_var, requested_level) + base_var, target_level = _split_var_and_level(label) + + try: + ds = safe_open_nc_with_time_decoding( + file_path, + time_name=env_coord_names.get("env_time"), + ) + ds = standardize_time_lat_lon(ds, coord_spec) + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") + ds.close() + continue + + da = ds[base_var] + dims = list(da.dims) + + # Detect lat/lon names; keep dataset sorted in both + lat_dim = "lat" if "lat" in dims else "latitude" + lon_dim = "lon" if "lon" in dims else "longitude" + ds = _ensure_sorted(ds, lat_dim, lon_dim) + da = ds[base_var] + dims = list(da.dims) + + # Unify/ensure time dimension is named 'time' + time_dim = "time" if "time" in dims else next( + (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), + None + ) + if time_dim is None: + ds.close() + raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") + if time_dim != "time": + ds = ds.rename({time_dim: "time"}) + da = ds[base_var] + dims = list(da.dims) + + # Resolve extra dimensions (pressure level, ensemble, expver, etc.) + # For the "level" dim: pick closest to `target_level` (or 1000 hPa by default). 
+ extra = [d for d in dims if d not in ("time", lat_dim, lon_dim)] + if extra: + sel = {} + for d in extra: + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) + else: + sel[d] = 0 # deterministic default for non-level extra dims + da = da.isel(**sel).squeeze() # now expected shape: (time, lat, lon) + + # Grid coordinate vectors + glat = ds[lat_dim].values + glon = ds[lon_dim].values + gtime = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") + + # Vectorised nearest grid-node indices for all points + lat_idx = _nearest_indices_vectorized(glat, out["location_lat"].to_numpy(dtype="float64")) + lon_idx = _nearest_indices_vectorized(glon, out["location_lon"].to_numpy(dtype="float64")) + + # Store the matched grid coordinates (useful for QA) + nc_latitudes[:] = glat[lat_idx] + nc_longitudes[:] = glon[lon_idx] + + # Group points by grid cell (to read each per-cell time series only once) + cell_code = (lat_idx.astype(np.int64) * len(glon)) + lon_idx.astype(np.int64) + unique_cells, inverse = np.unique(cell_code, return_inverse=True) + + # Cache of per-cell series: (ii, jj) -> 1D array over time. + # For continuous variables this is float64; for categorical/QC variables the original dtype is preserved. + series_cache: dict[tuple[int, int], np.ndarray] = {} + col_idx = out.columns.get_loc(label) + + for g, code in enumerate(unique_cells): + ii = int(code // len(glon)) + jj = int(code % len(glon)) + + pos = np.nonzero(inverse == g)[0] # row indices in `out` for this cell + xi = tgt_times[pos] # target times (int64 ns) + + key = (ii, jj) + if key not in series_cache: + raw_series = da.isel({lat_dim: ii, lon_dim: jj}).values + + if temporal_method == "nearest": + # Keep original dtype for categorical/QC variables. + # This avoids converting category codes to float and also supports non-numeric labels. + series_cache[key] = raw_series + else: + # Continuous variables: cast to float64 for np.interp. 
+ series_cache[key] = raw_series.astype("float64") + + y = series_cache[key] + + if temporal_method == "nearest": + # Categorical/QC-safe temporal sampling: + # take the value from the nearest available timestep, no interpolation. + m = pd.notna(y) + if m.sum() < 1: + out.iloc[pos, col_idx] = np.nan + continue + + x = gtime[m] # source times, int64 ns + yy = y[m] # source values, may be integer/category codes + + # Ensure time is sorted + order = np.argsort(x) + x = x[order] + yy = yy[order] + + idx = np.searchsorted(x, xi) + right = np.clip(idx, 0, len(x) - 1) + left = np.clip(idx - 1, 0, len(x) - 1) + + use_left = ( + (idx > 0) + & ( + (idx == len(x)) + | (np.abs(xi - x[left]) <= np.abs(x[right] - xi)) + ) + ) + + nearest_idx = np.where(use_left, left, right) + vals = yy[nearest_idx] + + # Keep existing "no extrapolation" behaviour: + # points outside the native NC time range remain NaN. + vals = vals.astype("object") + vals[(xi < x.min()) | (xi > x.max())] = np.nan + + out.iloc[pos, col_idx] = vals + + else: + # Continuous variables: existing linear temporal interpolation. 
+ y_float = y.astype("float64") + m = np.isfinite(y_float) + if m.sum() < 2: + out.iloc[pos, col_idx] = np.nan + continue + + x = gtime[m] + yy = y_float[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + + vals = np.interp(xi, x, yy) + + # Outside native time range → NaN + vals[(xi < x.min()) | (xi > x.max())] = np.nan + + out.iloc[pos, col_idx] = vals + + ds.close() + + except Exception as e: + print(f"[ERROR] {label}: {e}") + continue + + # Final QA columns + out["nc_lat"] = nc_latitudes + out["nc_lon"] = nc_longitudes + out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] + + # Harmonise return signature with the rest of your pipeline + return out, pd.NaT, pd.NaT + + +def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2, + coord_spec=None, env_coord_names: dict | None = None, + temporal_method: str = "linear"): + """ + Annotate movement points with environmental values using: + - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes + - Temporal: + * "linear" -> 1D linear interpolation in time per grid node + * "nearest" -> nearest available timestep per grid node + + Important: + IDW is suitable for continuous numeric variables. Even with temporal_method="nearest", + spatial IDW still averages values across neighbouring grid nodes, so it is not + recommended for true categorical/QC variables. + + This version understands expanded variable labels that include a pressure/vertical level, + e.g. "v_1000", "v_975". It will: + 1) parse the UI label into (base_var, target_level), + 2) find a known vertical dimension (isobaricInhPa/level/lev/plev/...), + 3) slice the DataArray to the closest level to `target_level` (or 1000 hPa by default). + + Parameters + ---------- + df : pandas.DataFrame + Movebank-like table with columns: timestamp, location_lat, location_lon, etc. + env_var_map : dict[str, str] + Mapping from UI label to NetCDF path, e.g. {"v_1000": "/path/file.nc"}. 
+ selected_vars : list[str] + Labels picked in the UI; each label becomes a column in the output. + movebank_path : str + Kept for signature symmetry with the rest of the pipeline (output path handled upstream). + smoothing_k : int + Number of nearest grid nodes for IDW (>=2). + + Returns + ------- + (out_df, nc_start, nc_end) + `out_df` contains new columns with the same names as `selected_vars`. + `nc_start`, `nc_end` are placeholders here (NaT). + """ + # --- input prep ---------------------------------------------------------------- + k = max(2, int(smoothing_k)) + out = df.copy() + out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") + out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) + + temporal_method = (temporal_method or "linear").strip().lower() + if temporal_method not in ("linear", "nearest"): + temporal_method = "linear" + env_coord_names = env_coord_names or {} + + # Keep nc_lat/nc_lon semantics consistent with prior implementation (copy of point coords) + out["nc_lat"] = out["location_lat"].values + out["nc_lon"] = out["location_lon"].values + + # Vectorised numeric targets for temporal interpolation + tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") + lat_vals = out["location_lat"].to_numpy(dtype="float64") + lon_vals = out["location_lon"].to_numpy(dtype="float64") + + # --- main loop over labels ----------------------------------------------------- + for label in selected_vars: + file_path = env_var_map.get(label) + if temporal_method == "nearest": + # Nearest-time mode: preserve raw values before spatial handling. + # Note: spatial IDW is still numeric and is not recommended for true categorical/QC variables. 
+ out[label] = pd.Series([pd.NA] * len(out), index=out.index, dtype="object") + else: + out[label] = np.nan # continuous numeric column + + if not file_path or not Path(file_path).is_file(): + print(f"[WARNING] File for {label} not found: {file_path}") + continue + + # Split label into base variable and optional requested level + base_var, target_level = _split_var_and_level(label) + + try: + ds = safe_open_nc_with_time_decoding( + file_path, + time_name=env_coord_names.get("env_time"), + ) + ds = standardize_time_lat_lon(ds, coord_spec) + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not in {file_path}") + ds.close() + continue + + da = ds[base_var] + dims = list(da.dims) + + # Detect coordinate names and sort dataset (required by nearest/k-nearest search) + lat_dim = "lat" if "lat" in dims else "latitude" + lon_dim = "lon" if "lon" in dims else "longitude" + ds = _ensure_sorted(ds, lat_dim, lon_dim) + da = ds[base_var] + dims = list(da.dims) + + # Unify time dimension name to 'time' + time_dim = "time" if "time" in dims else next( + (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), None + ) + if time_dim is None: + ds.close() + raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") + if time_dim != "time": + ds = ds.rename({time_dim: "time"}) + da = ds[base_var] + dims = list(da.dims) + + # Resolve extra dimensions (pressure level, ensemble, expver, etc.) 
+ extra_dims = [d for d in dims if d not in ("time", lat_dim, lon_dim)] + if extra_dims: + sel = {} + for d in extra_dims: + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) + else: + sel[d] = 0 # deterministic default for non-level dims + da = da.isel(**sel).squeeze() # -> (time, lat, lon) + + # Coordinate vectors + glat = ds[lat_dim].values + glon = ds[lon_dim].values + gtime_int = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") + + # Cache per-grid-node time series (to avoid repeated reads for neighbors) + # key: (ii, jj) -> (x_int64_valid, y_valid) + # For linear mode y_valid is float64; for nearest-time mode original dtype is preserved. + series_cache: dict[tuple[int, int], tuple[np.ndarray, np.ndarray]] = {} + col_idx = out.columns.get_loc(label) + + # Row-wise IDW over k nearest grid nodes + for i in range(len(out)): + t_i = tgt_times[i] + xlat = lat_vals[i] + xlon = lon_vals[i] + + # If outside the native time span → keep NaN + if t_i < gtime_int.min() or t_i > gtime_int.max(): + continue + + nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) # provided elsewhere + vals = np.empty(k, dtype="float64") + dists = np.empty(k, dtype="float64") + + for j, (ii, jj) in enumerate(nn_idx): + key = (ii, jj) + if key not in series_cache: + raw_y = da.isel({lat_dim: ii, lon_dim: jj}).values + + if temporal_method == "nearest": + # Keep original values for nearest-time lookup. + m = pd.notna(raw_y) + if m.sum() >= 1: + x = gtime_int[m] + yy = raw_y[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + else: + x = np.empty(0, dtype="int64") + yy = np.empty(0, dtype=raw_y.dtype) + + else: + # Linear interpolation requires numeric float values. 
+ y = raw_y.astype("float64") + m = np.isfinite(y) + if m.sum() >= 2: + x = gtime_int[m] + yy = y[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + else: + x = np.empty(0, dtype="int64") + yy = np.empty(0, dtype="float64") + + series_cache[key] = (x, yy) + + x, yy = series_cache[key] + if temporal_method == "nearest": + if x.size < 1: + vals[j] = np.nan + else: + idx = np.searchsorted(x, t_i) + + right = np.clip(idx, 0, len(x) - 1) + left = np.clip(idx - 1, 0, len(x) - 1) + + use_left = ( + (idx > 0) + and ( + (idx == len(x)) + or (abs(t_i - x[left]) <= abs(x[right] - t_i)) + ) + ) + + nearest_idx = left if use_left else right + v = yy[nearest_idx] + + # Keep no-extrapolation behaviour. + if (t_i < x.min()) or (t_i > x.max()): + v = np.nan + + vals[j] = v + + else: + if x.size < 2: + vals[j] = np.nan + else: + v = np.interp(t_i, x, yy) + + # Keep no-extrapolation behaviour. + if (t_i < x.min()) or (t_i > x.max()): + v = np.nan + + vals[j] = v + + # Planar Euclidean distance in degrees (consistent with prior code) + dists[j] = np.hypot(glat[ii] - xlat, glon[jj] - xlon) + + out.iloc[i, col_idx] = _idw(vals, dists, p=2) # provided elsewhere + + ds.close() + + except Exception as e: + print(f"[ERROR] {label}: {e}") + continue + + # Geometry for QA/exports + out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] + return out, pd.NaT, pd.NaT + +def annotate_env_bilinear_projected( + df, + env_var_map, + selected_vars, + movebank_path, + env_coord_names: dict | None = None, +): + """ + Annotate movement points with environmental values using: + - Spatial: bilinear interpolation on a 1D projected grid (x/y) + - Temporal: linear interpolation in time (xarray interp) + + Tracks input: + - requires lon/lat columns: location_lon, location_lat + - projects lon/lat -> x/y into the env dataset's native CRS using CF metadata + + Env input: + - dataset has 1D x and y coordinate vectors (projected grid) + - dataset provides CF projection 
metadata so `read_crs_from_cf()` can infer CRS + + Returns: (out_df, pd.NaT, pd.NaT) for signature compatibility. + """ + out = df.copy() + out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") + + # Require lon/lat (your code already normalizes movement columns sometimes) + required = ["timestamp", "location_lat", "location_lon"] + out = out.dropna(subset=required) + + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") # optional + x_name = env_coord_names.get("env_x") + y_name = env_coord_names.get("env_y") + + if not x_name or not y_name: + raise ValueError( + "Bilinear (projected) requires env_coord_names['env_x'] and ['env_y'] " + "(Projected (x/y) mode)." + ) + if env_coord_names.get("env_lat") or env_coord_names.get("env_lon"): + raise ValueError("Bilinear (projected) requires Projected (x/y) spatial mode, not Geographic (lat/lon).") + + # Target time values (vectorized) + tgt_t = out["timestamp"].to_numpy("datetime64[ns]") + + # Track lon/lat arrays + lon = pd.to_numeric(out["location_lon"], errors="coerce").to_numpy(dtype="float64") + lat = pd.to_numeric(out["location_lat"], errors="coerce").to_numpy(dtype="float64") + + # Drop any rows with bad numeric lon/lat + good = np.isfinite(lon) & np.isfinite(lat) & out["timestamp"].notna().to_numpy() + if not good.all(): + out = out.loc[good].copy() + tgt_t = tgt_t[good] + lon = lon[good] + lat = lat[good] + + # QA columns + out["x"] = np.nan + out["y"] = np.nan + + # Cache CRS/transformer per file path (since you may have multiple labels/files) + crs_cache: dict[str, "CRS"] = {} + + for label in selected_vars: + file_path = env_var_map.get(label) + out[label] = np.nan + + if not file_path or not Path(file_path).is_file(): + print(f"[WARNING] File for {label} not found: {file_path}") + continue + + base_var, target_level = _split_var_and_level(label) + + try: + ds = safe_open_nc_with_time_decoding(file_path, time_name=time_name) + + if base_var not 
in ds: + print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") + ds.close() + continue + + da = ds[base_var] + dims = list(da.dims) + + # Must be able to interpolate along x/y dims + x_dim = x_name if x_name in dims else None + y_dim = y_name if y_name in dims else None + if x_dim is None or y_dim is None: + ds.close() + raise ValueError( + f"Bilinear requires x/y to be dims of {base_var!r}.\n" + f" Requested x dim: {x_name!r} (is_dim={x_name in dims})\n" + f" Requested y dim: {y_name!r} (is_dim={y_name in dims})\n" + f" Available dims: {dims}" + ) + + # Sort for interpolation stability + ds = _ensure_sorted(ds, y_dim, x_dim) + da = ds[base_var] + dims = list(da.dims) + + if "time" not in dims: + ds.close() + raise ValueError(f"No 'time' dim after decoding for '{base_var}'. dims={dims}") + + # Validate 1D x/y coordinate vectors + gx = np.asarray(ds[x_dim].values) + gy = np.asarray(ds[y_dim].values) + if gx.ndim != 1 or gy.ndim != 1: + ds.close() + raise ValueError( + f"Bilinear method requires 1D coordinate vectors for '{y_dim}' and '{x_dim}'. " + f"Got shapes: {y_dim}={gy.shape}, {x_dim}={gx.shape}." + ) + + # Handle extra dims (pressure level, ensemble, expver, etc.) 
def read_crs_from_cf(ds: xr.Dataset, var_name: str | None = None) -> CRS:
    """
    Infer the coordinate reference system (CRS) of a gridded dataset from
    its CF / GDAL-style metadata.

    Sources are tried in this order; the first one that yields a CRS wins:
      1) the grid-mapping variable named by ``ds[var_name].attrs["grid_mapping"]``
         (parsed with ``CRS.from_cf``),
      2) global attributes holding WKT ("crs_wkt", "spatial_ref", "proj_wkt", "wkt"),
      3) global attributes holding a PROJ-style string
         ("proj4", "proj4text", "proj", "projection"),
      4) a standalone ``crs`` variable: its WKT attributes first, then its
         attributes interpreted as CF.

    A source that is present but cannot be parsed is skipped rather than
    aborting the search, so one malformed attribute does not hide a valid
    definition elsewhere. The reasons for every rejected source are appended
    to the final error message.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset carrying the projection metadata.
    var_name : str or None, optional
        Data variable whose ``grid_mapping`` attribute is inspected first.
        When None (or not in ``ds``), step 1 is skipped.

    Returns
    -------
    pyproj.CRS
        The first CRS that could be built from the sources above.

    Raises
    ------
    ValueError
        If none of the sources yields a usable CRS.
    """
    problems = []

    def _try_build(origin, builder, payload):
        # Build a CRS from one source; on failure remember why and return None
        # so the caller can move on to the next source.
        try:
            return builder(payload)
        except Exception as exc:
            problems.append(f"{origin}: {exc}")
            return None

    def _is_text(value):
        return isinstance(value, str) and bool(value.strip())

    # 1) Variable-specific CF grid mapping.
    if var_name is not None and var_name in ds:
        grid_mapping_name = ds[var_name].attrs.get("grid_mapping")
        if grid_mapping_name and grid_mapping_name in ds.variables:
            crs = _try_build(
                f"grid_mapping variable {grid_mapping_name!r}",
                CRS.from_cf,
                ds[grid_mapping_name].attrs,
            )
            if crs is not None:
                return crs

    # 2) Global attributes holding WKT.
    for key in ("crs_wkt", "spatial_ref", "proj_wkt", "wkt"):
        wkt = ds.attrs.get(key)
        if _is_text(wkt):
            crs = _try_build(f"global attribute {key!r}", CRS.from_wkt, wkt)
            if crs is not None:
                return crs

    # 3) Global attributes holding a PROJ-style definition string.
    for key in ("proj4", "proj4text", "proj", "projection"):
        proj = ds.attrs.get(key)
        if _is_text(proj):
            crs = _try_build(f"global attribute {key!r}", CRS.from_string, proj)
            if crs is not None:
                return crs

    # 4) Standalone "crs" variable: WKT attributes first, then CF attributes.
    if "crs" in ds.variables:
        crs_attrs = ds["crs"].attrs
        for key in ("crs_wkt", "spatial_ref"):
            wkt = crs_attrs.get(key)
            if _is_text(wkt):
                crs = _try_build(f"'crs' variable attribute {key!r}", CRS.from_wkt, wkt)
                if crs is not None:
                    return crs
        crs = _try_build("'crs' variable CF attributes", CRS.from_cf, crs_attrs)
        if crs is not None:
            return crs

    message = "Could not infer CRS from dataset (no usable CF grid_mapping / WKT / proj string found)."
    if problems:
        message += " Rejected sources: " + "; ".join(problems)
    raise ValueError(message)
np.ndarray, + target_crs: CRS, +) -> tuple[np.ndarray, np.ndarray]: + """ + Project track locations from geographic coordinates (longitude, latitude) + to the native x/y coordinate system of a projected environmental grid. + + This function is used to transform animal tracking locations + (WGS84 lon/lat) into the coordinate system of gridded datasets such as + NARR before spatial interpolation using xarray. + + Parameters + ---------- + lon : array-like + Longitudes of track locations in degrees east (EPSG:4326). + lat : array-like + Latitudes of track locations in degrees north (EPSG:4326). + target_crs : pyproj.CRS + Target projected CRS describing the environmental dataset grid. + + Returns + ------- + x : numpy.ndarray + Projected x-coordinates of track locations in the target CRS. + y : numpy.ndarray + Projected y-coordinates of track locations in the target CRS. + """ + + lon = np.asarray(lon, dtype=float) + lat = np.asarray(lat, dtype=float) + + transformer = Transformer.from_crs("EPSG:4326", target_crs, always_xy=True) + x, y = transformer.transform(lon, lat) + return np.asarray(x, dtype=float), np.asarray(y, dtype=float) + + +def _safe_remove_existing_file(path, retries: int = 5, delay: float = 0.5): + """ + Remove an existing file before overwriting it. + + This is mainly needed on Windows, where NetCDF files can remain locked + for a short time after being opened by xarray/netCDF4/h5netcdf. + """ + path = Path(path) + + if not path.exists(): + return + + last_error = None + + for _ in range(retries): + try: + gc.collect() + path.unlink() + return + except PermissionError as e: + last_error = e + time.sleep(delay) + + raise PermissionError( + f"Could not remove existing file because it is still locked: {path}. " + f"Close any open dataset/viewer using this file and try again. 
" + f"Original error: {last_error}" + ) + +def convert_tif_to_nc_before_annotation(tif_paths, output_dir): + """ + Converts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. + For each variable, builds a data(time, lat, lon) array. + Returns the path to the generated .nc file. + """ + tif_paths = [str(Path(p)) for p in tif_paths] + if not tif_paths: + raise ValueError("No .tif files provided") + + # 1) Group files by variable + by_var = {} + for tif in tif_paths: + vname = parse_appeears_variable_name(tif) + by_var.setdefault(vname, []).append(tif) + + lat = lon = None + data_vars = {} + + for vname, files in by_var.items(): + times = [] + planes = [] + first_geo = True + + for tif in sorted(files): + tif_name = Path(tif).name + t = parse_time_from_filename(tif_name) + times.append(t) + + with rasterio.open(tif) as src: + arr = src.read(1).astype("float32") + nodata = src.nodata + if nodata is not None: + arr = np.where(arr == nodata, np.nan, arr) + + # IMPORTANT: + # Do not apply scale_factor / add_offset during TIF -> NetCDF conversion. + # The NetCDF stores raw raster values. + # + # Optional scale/offset correction is applied later after sampling, + # and only to user-selected continuous variables. + # + # This avoids corrupting categorical/QC layers such as masks, flags, + # land-cover classes, or quality codes. 
def parse_time_from_filename(filename):
    """
    Extract the acquisition date from an AppEEARS-style file name.

    Example: MOD13A1.061__500m_16_days_NDVI_doy2014145000000_aid0001.tif
    The date is read from the "doyYYYYDDD" token, where YYYY is the year and
    DDD is the day of year; any digits following those seven are ignored.

    Parameters
    ----------
    filename : str
        File name (or path) containing a "doyYYYYDDD" token.

    Returns
    -------
    datetime
        Midnight of the parsed date.

    Raises
    ------
    ValueError
        If the token is absent or does not describe a valid year/day-of-year.
    """
    match = re.search(r'doy(\d{4})(\d{3})', filename)
    if match is None:
        raise ValueError(f"Cannot parse time from filename: {filename}")

    # Hand the matched digits to strptime unchanged: converting them to int
    # first would drop leading zeros and shift the year/day boundary.
    year_digits, doy_digits = match.groups()
    try:
        return datetime.strptime(year_digits + doy_digits, "%Y%j")
    except ValueError as exc:
        raise ValueError(f"Cannot parse time from filename: {filename}") from exc
+ - or one of the known tokens in KNOWN_TOKENS + (C) fallback -> "data" + """ + p = Path(tif_path) + name = p.name + + # A) read TIF tags + try: + with rasterio.open(tif_path) as src: + tags = src.tags() + for key in ("long_name", "DESCRIPTION", "Description", "Layer", "LAYER", "BAND_NAME"): + if key in tags and str(tags[key]).strip(): + raw = str(tags[key]).strip() + var = re.sub(r"[^\w\-]+", "_", raw) + return var + except Exception: + pass + + # B1) token before "doyYYYYDDD" + m = re.search(r"_([A-Za-z0-9][A-Za-z0-9_]+)_doy\d{7}", name) + if m: + return m.group(1) + + # B2) known tokens (common AppEEARS layers; list is incomplete but useful) + KNOWN_TOKENS = { + "NDVI", "EVI", + "LST_Day_1km", "LST_Night_1km", "LST_Day_1KM", "LST_Night_1KM", "QC_Day", "QC_Night", + "Lai_500m", "Fpar_500m", "FparLai_QC", + "Nadir_Reflectance_Band1", "Nadir_Reflectance_Band2", "Nadir_Reflectance_Band3", + "Nadir_Reflectance_Band4", "Nadir_Reflectance_Band5", "Nadir_Reflectance_Band6", + "Nadir_Reflectance_Band7", + "SurfReflect_Band1", "SurfReflect_Band2", "SurfReflect_Band3", + "SurfReflect_Band4", "SurfReflect_Band5", "SurfReflect_Band6", "SurfReflect_Band7", + "NDSI_Snow_Cover", + "VIIRS_NDVI", "VIIRS_EVI", + "BurnDate", "BurnDate_Uncertainty", "LAI", "FPAR", "QC" + } + candidates = sorted([t for t in KNOWN_TOKENS if t in name], key=len, reverse=True) + if candidates: + return candidates[0] + + parts = re.split(r"[_.]", name) + parts = [t for t in parts if t and t.lower() != "tif"] + parts = [t for t in parts if not t.lower().startswith("aid")] + parts = [t for t in parts if not re.fullmatch(r"\d{7,8}", t) and not t.startswith("doy")] + if parts: + parts.sort(key=len, reverse=True) + return parts[0] + + return "data" + + +def _ensure_sorted(ds, lat_dim, lon_dim): + if (np.diff(ds[lat_dim].values) < 0).all(): + ds = ds.sortby(lat_dim) + if (np.diff(ds[lon_dim].values) < 0).all(): + ds = ds.sortby(lon_dim) + return ds + + +def _nearest_index(arr, x): + # array arr growing: fast 
via searchsorted + local check + idx = np.searchsorted(arr, x) + if idx == 0: + return 0 + if idx >= len(arr): + return len(arr) - 1 + return idx if abs(arr[idx] - x) < abs(arr[idx-1] - x) else idx-1 + + +def _k_nearest_indices(glat, glon, xlat, xlon, k): + """Returns an array of indices (ilat, ilon) of length k among candidates from the local window""" + # first the shortest path is the nearest grid + i0 = _nearest_index(glat, xlat) + j0 = _nearest_index(glon, xlon) + + # form a small window around (i0, j0) sufficient to find k neighbors + # empirically: radius r = ceil(max(1, sqrt(k))) → (2r+1)^2 >= k + r = int(np.ceil(max(1, np.sqrt(k)))) + i_min, i_max = max(0, i0 - r), min(len(glat) - 1, i0 + r) + j_min, j_max = max(0, j0 - r), min(len(glon) - 1, j0 + r) + + # collect candidates in the window + cand = [] + for ii in range(i_min, i_max + 1): + for jj in range(j_min, j_max + 1): + d = np.hypot(glat[ii] - xlat, glon[jj] - xlon) + cand.append((d, ii, jj)) + cand.sort(key=lambda t: t[0]) + top = cand[:k] + return [(ii, jj) for _, ii, jj in top] + + +def _idw(values, distances, p=2): + """IDW average for already interpolated values. 
distances > 0 (add eps).""" + vals = np.array(values, dtype=float) + d = np.array(distances, dtype=float) + 1e-12 + w = 1.0 / (d ** p) + # ignore NaN in vals + mask = ~np.isnan(vals) + if not mask.any(): + return np.nan + w_sel = w[mask] + v_sel = vals[mask] + return np.sum(w_sel * v_sel) / np.sum(w_sel) + + +def _detect_time_name(ds): + # 1)quick candidates by name + name_candidates = ("time","valid_time","forecast_time","verification_time","t","Time","datetime","date") + for c in name_candidates: + if c in ds.coords or c in ds.variables: + return c + + # 2) CF attributes: standard_name = "time" or units with the word "since" + for name, var in ds.variables.items(): + stdn = str(var.attrs.get("standard_name","")).lower() + units = str(var.attrs.get("units","")) + if stdn == "time": + return name + if "since" in units: + return name + return None + + +def _split_var_and_level(label: str): + """ + If the name is in the format _, returns ('var', target_level_float). + Otherwise ('label', None). + """ + m = re.match(r"^([A-Za-z_]\w*)_(\d{2,4})$", str(label)) + if m: + base = m.group(1) + try: + lvl = float(m.group(2)) + except Exception: + lvl = None + return base, lvl + return label, None + + +def _pick_level_index(ds, level_dim: str, target_level: float | None): + """ + Returns the level index: + - if target_level is given, the closest to it; + - otherwise, the closest to 1000 hPa; + - if error, 0. 
+ """ + try: + vals = np.asarray(ds[level_dim].values, dtype=float) + if vals.size == 0: + return 0 + ref = 1000.0 if target_level is None else float(target_level) + return int(np.nanargmin(np.abs(vals - ref))) + except Exception: + return 0 diff --git a/ecodata/app/apps/__init__.py b/ecodata/app/apps/__init__.py index 253765d..b3cb357 100644 --- a/ecodata/app/apps/__init__.py +++ b/ecodata/app/apps/__init__.py @@ -2,4 +2,9 @@ import ecodata.app.apps.movie_maker_app # noqa import ecodata.app.apps.subsetter_app # noqa import ecodata.app.apps.tracks_explorer_app # noqa +import ecodata.app.apps.annotation_engine_app # noqa +import ecodata.app.apps.presence_data_preparation_app # noqa +import ecodata.app.apps.nc_builder_app # noqa +#import ecodata.app.apps.height_sampler_app +import ecodata.app.apps.multidimensional_annotation_app # noqa from ecodata.panel_utils import applications # noqa diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py new file mode 100644 index 0000000..9820d7b --- /dev/null +++ b/ecodata/app/apps/annotation_engine_app.py @@ -0,0 +1,1954 @@ +import logging +from pathlib import Path +import panel as pn +import param +import pandas as pd +import xarray as xr +from panel.io.loading import start_loading_spinner, stop_loading_spinner +from ecodata.app.models import FileSelector +from ecodata.panel_utils import param_widget, register_view, try_catch, rename_param_widgets +from ecodata.app.config import DEFAULT_TEMPLATE +from datetime import datetime +import re +from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv +from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only, delete_files +from ecodata.annotation_eng_func import ( + start_annotation_process, + convert_tif_to_nc_before_annotation, + get_nc_bounds, + open_nc_metadata, + detect_env_coord_names, + 
safe_open_nc_with_time_decoding, +) + +logger = logging.getLogger(__file__) + +class movebank_annotation_engine(param.Parameterized): + local_ID_file = param_widget(FileSelector(constrain_path=False, expanded=True, size=10)) + load_data_button = param_widget(pn.widgets.Button(name="Load data", button_type="primary")) + taxon_name_val = param_widget( + pn.widgets.MultiSelect(name="Taxon name (use Ctrl or ⌘ for multiple selection)", options=[], height = 140, disabled=True) + ) + individual_ID = param_widget( + pn.widgets.MultiSelect(name="Individual ID (use Ctrl or ⌘ for multiple selection)", options=[], height = 140, disabled=True) + ) + simple_interp_button = param_widget(pn.widgets.Button(name="Simple interpolation (missing ≤ 1 day)", button_type="primary")) + deployment_time_gap = param_widget( + pn.widgets.IntInput(name="Deployment time gap (minutes)", value=60, step=60, start=0) + ) + min_expected_obs = param_widget( + pn.widgets.IntInput(name="Minimum expected number of observations(per deployment)", value=100, step=50, start=10) + ) + + time_selection_ID = param_widget( + pn.widgets.DatetimeRangeSlider( + name="Select Time Range", + start=datetime(2010, 1, 1), + end=datetime(2025, 12, 31), + value=(datetime(2016, 6, 13), datetime(2016, 6, 14)), + step=2_592_000_000 + ) + ) + time_interval = param_widget(pn.widgets.IntInput(name="Timestep for Interpolation/Averaging (minutes)", value=30, step=1, start=1)) + start_from_midnight = param_widget(pn.widgets.Checkbox(name="First timestamp = 00:00:00", value=False)) + out_csv_name = param_widget(pn.widgets.TextInput(name="Output CSV", value=str(Path.home() / "Downloads" / "subset.csv"))) + make_csv = param_widget(pn.widgets.Button(name="Make CSV", button_type="primary")) + merge_files = param_widget(pn.widgets.Checkbox(name="Merge files after processing", value=False)) + delete_individual_ID_files = param_widget(pn.widgets.Checkbox(name="Delete individual files after merge", value=True)) + + folder_to_merge = 
param_widget(pn.widgets.TextInput(name="Folder with CSV files to merge (select folder)", value=str(Path.home() / "Downloads"))) + delete_empty_columns = param_widget(pn.widgets.Checkbox(name="Delete empty columns after merging", value=False)) + out_merged_csv_name = param_widget(pn.widgets.TextInput(name="Output merged CSV", value=str(Path.home() / "Downloads" / "merged.csv"))) + merge_files_button = param_widget(pn.widgets.Button(name="Merge files in folder", button_type="primary")) + + # === Annotation Engine widgets === + env_data_selector = param_widget( + FileSelector( + name="Environmental data (.nc)", + constrain_path=False, + expanded=True, + size=10 + ) + ) + bound_data_selector = param_widget(FileSelector(name="Boundary data (.shp)", constrain_path=False, expanded=True, size=10)) + movement_data_selector = param_widget(FileSelector(name="Movebank data (.csv)", constrain_path=False, expanded=True, size=10)) + load_env_button = pn.widgets.Button(name="Load environmental data", button_type="primary") + load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") + load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") + reset_bound_button = pn.widgets.Button(name="(!) 
Reset boundary", button_type="primary") + nc_time_var = pn.widgets.Select(name="Time variable", options=[], value=None) + nc_lat_var = pn.widgets.Select(name="Latitude variable", options=[], value=None) + nc_lon_var = pn.widgets.Select(name="Longitude variable", options=[], value=None) + env_spatial_mode = pn.widgets.RadioButtonGroup( + name="Env spatial coordinate mode", + options=["Geographic (lat/lon)", "Projected (x/y)"], + value="Geographic (lat/lon)", + button_type="default", + ) + env_x_select = pn.widgets.Select(name="X coordinate", options=[], value=None) + env_y_select = pn.widgets.Select(name="Y coordinate", options=[], value=None) + env_continuous_selector = pn.widgets.MultiSelect( + name="Continuous (use Ctrl or ⌘ for multiple selection)", + options=[], value=[], height=180 + ) + + env_categorical_selector = pn.widgets.MultiSelect( + name="Categorical (use Ctrl or ⌘ for multiple selection)", + options=[], value=[], height=180 + ) + + taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl or ⌘ for multiple)", height = 140) + id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl or ⌘ for multiple)", height = 140) + env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", + options=["2", "4", "6", "8"], + value="4" + ) + output_path = pn.widgets.TextInput(name="Output path", value=str(Path.home() / "Downloads" / "annotated_env.csv")) + boundary_info_str = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environment data boundary", + name="", + styles={"white-space": "pre-wrap"}, + sizing_mode="stretch_width" + ) + interpolation_method = pn.widgets.Select( + name="Interpolation method (spatial)", + options=[ + "Nearest neighbor (time-linear)", + "Inverse Distance Weighting (time-linear)", + "Bilinear (projected x/y, time-linear)", + ], + value="Inverse Distance Weighting (time-linear)" + ) + make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") + + + status_text = param.String("Ready...") + #TIF widgets + # === TIF Annotation Engine widgets === + tif_env_data_selector = param_widget( + FileSelector( + name="Select any .tif file in folder", + constrain_path=False, + expanded=True, + size=10 + ) + ) + tif_movement_data_selector = param_widget(FileSelector(name="Movebank data", constrain_path=False, expanded=True,size=10)) + tif_bound_data_selector = param_widget(FileSelector(name="Boundary data", constrain_path=False, expanded=True, size=10)) + + tif_load_env_button = pn.widgets.Button(name="Load TIF environmental data", button_type="primary") + tif_load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") + tif_load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") + tif_reset_bound_button = pn.widgets.Button(name="(!) 
Reset boundary", button_type="primary") + tif_control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", + options=["2", "4", "6", "8"], + value="4" + ) + tif_env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl or ⌘ for multiple)", options=[], height = 140) + # TIF variable type: continuous vs categorical + tif_continuous_vars = pn.widgets.MultiSelect(name="Continuous variables (use Ctrl or ⌘ for multiple)", options=[], value=[], size=8) + tif_categorical_vars = pn.widgets.MultiSelect(name="Categorical/QC variables (use Ctrl or ⌘ for multiple)", options=[], value=[], size=8) + # prevent recursive watcher updates + _syncing_tif_var_types = False + tif_taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl or ⌘ for multiple)", height = 140) + tif_id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl or ⌘ for multiple)", height = 140) + tif_env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + tif_movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + tif_output_path = pn.widgets.TextInput(name="Output path", value=str(Path.home() / "Downloads" / "annotated_env_tif.csv")) + tif_boundary_info_str = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environment data boundary", + sizing_mode="stretch_width" + ) + # --- TIF scaling (optional) --- + tif_apply_scale = pn.widgets.Checkbox(name="Apply scale factor / offset", value=False) + tif_scale_factor = pn.widgets.FloatInput( + name="Scale factor", value=1.0, step=0.0001, start=None, disabled=True + ) + tif_add_offset = pn.widgets.FloatInput( + name="Add offset", value=0.0, step=0.1, start=None, disabled=True + ) + + tif_interpolation_method = pn.widgets.Select( + name="Interpolation method (spatial)", + options=["Nearest neighbor (time-linear)", "Inverse Distance Weighting (time-linear)"], + value="Inverse Distance Weighting (time-linear)" + ) + tif_make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") + + + def __init__(self, **params): + super().__init__(**params) + + self.interpolation_method.name = "Spatial interpolation method (.nc)" + self.tif_interpolation_method.name = "Spatial interpolation method (.tif)" + self._wire_env_split_guards() + self._apply_env_selector_labels() + rename_param_widgets( + self, + [ + "local_ID_file", "load_data_button", + "taxon_name_val", "individual_ID", "simple_interp_button", + "deployment_time_gap", "min_expected_obs", + "time_selection_ID", "time_interval", + "start_from_midnight", "out_csv_name", + "make_csv", "merge_files", + "delete_individual_ID_files","folder_to_merge", + "delete_empty_columns", "out_merged_csv_name", + "merge_files_button", + # === NC Annotation tab === + "env_data_selector", + "bound_data_selector", "movement_data_selector", + "load_env_button", "load_bound_button", "reset_bound_button", + "load_movement_button", "env_continuous_selector", "env_categorical_selector", + "taxon_multiselect", "id_multiselect", + "boundary_info_str", "interpolation_method", + "control_smoothing", + "env_info", "movement_info" ,"output_path", + "make_annotation_button", + "nc_time_var", "nc_lat_var","nc_lon_var", + "env_spatial_mode", "env_x_select", 
"env_y_select", + # === TIF Annotation tab === + "tif_env_data_selector", + "tif_movement_data_selector", + "tif_bound_data_selector", "tif_reset_bound_button", + "tif_env_data_multiselect", + "tif_continuous_vars", "tif_categorical_vars", + "tif_taxon_multiselect", + "tif_id_multiselect", + "tif_interpolation_method", "tif_control_smoothing", + "tif_apply_scale", "tif_scale_factor", "tif_add_offset", + "tif_env_info", "tif_movement_info", + "tif_make_annotation_button" + ] + ) + + self.df = None + self.alert = pn.pane.Markdown(self.status_text) + NC_H = 1080 + # === NC tab === + self._nc_col1 = self._section( + "1. Environmental data (.nc)", + pn.Column(self.env_data_selector, sizing_mode="stretch_width"), + self.load_env_button, + self.env_continuous_selector, + self.env_categorical_selector, + self.env_info, + self.env_spatial_mode, + self.nc_time_var, + self.nc_lat_var, + self.nc_lon_var, + self.env_x_select, + self.env_y_select, + self.interpolation_method, + self.control_smoothing, + self.output_path, + height=NC_H + 400, + ) + self._nc_col2 = self._section( + "2. Movebank data (.csv)", + pn.Column(self.movement_data_selector, sizing_mode="stretch_width"), + self.load_movement_button, + self.taxon_multiselect, + self.id_multiselect, + self.movement_info, + height=NC_H + 400, + ) + self._nc_col3 = self._section( + "3. Boundary data (.shp/.geojson)", + pn.Column(self.bound_data_selector, sizing_mode="stretch_width"), + pn.Row(self.load_bound_button, self.reset_bound_button), + self.boundary_info_str, + pn.layout.Divider(), + pn.pane.Markdown("### 4. 
Start annotation"), + self.make_annotation_button, + height=NC_H + 400, + ) + + # synchronize heights after rendering + pn.state.onload(self._sync_nc_column_heights) + + self.anotation_engine_tab = pn.Column( + pn.pane.Markdown("### Annotation engine - .nc", sizing_mode="stretch_width"), + pn.GridBox( + self._nc_col1, self._nc_col2, self._nc_col3, + ncols=3, sizing_mode="stretch_width", + height=1400, + scroll=True, + ), + ) + + # TIF + TIF_H = 1800 + self._tif_col1 = self._section( + "1. Environmental data (.tif) - select one (of)", + pn.Column(self.tif_env_data_selector, sizing_mode="stretch_width"), + self.tif_load_env_button, + self.tif_continuous_vars, + self.tif_categorical_vars, + + pn.layout.Divider(), + self.tif_env_info, + self.tif_interpolation_method, + self.tif_control_smoothing, + self.tif_output_path, + pn.pane.Markdown("### Post-sampling correction for continuous variables"), + self.tif_apply_scale, + self.tif_scale_factor, + self.tif_add_offset, + height=TIF_H, + ) + + self._tif_col2 = self._section( + "2. Movebank data (.csv)", + pn.Column(self.tif_movement_data_selector, sizing_mode="stretch_width"), + self.tif_load_movement_button, + self.tif_taxon_multiselect, + self.tif_id_multiselect, + self.tif_movement_info, + height=TIF_H, + ) + + self._tif_col3 = self._section( + "3. Boundary data (.shp/.geojson)", + pn.Column(self.tif_bound_data_selector, sizing_mode="stretch_width"), + pn.Row(self.tif_load_bound_button, self.tif_reset_bound_button), + self.tif_boundary_info_str, + pn.layout.Divider(), + pn.pane.Markdown("### 4. 
Start annotation"), + self.tif_make_annotation_button, + height=TIF_H, + ) + + self.anotation_engine_tif_tab = pn.Column( + pn.pane.Markdown("### Annotation engine - .tif", sizing_mode="stretch_width"), + pn.GridBox( + self._tif_col1, self._tif_col2, self._tif_col3, + ncols=3, + sizing_mode="stretch_width", + ), + ) + + self.crop_interpolate_tab = pn.Column( + pn.pane.Markdown("### Crop files"), + self.local_ID_file, + self.load_data_button, + pn.Row( + self.taxon_name_val, + self.individual_ID, + ), + self.simple_interp_button, + pn.Column(self.deployment_time_gap, self.min_expected_obs), + self.time_selection_ID, + pn.Row(self.time_interval, self.start_from_midnight), + self.out_csv_name, + self.make_csv, + self.merge_files, + self.delete_individual_ID_files, + self.alert + ) + + self.merge_tab = pn.Column( + pn.pane.Markdown("### Merge files (Please select a **folder** with CSV files)"), + self.folder_to_merge, + self.delete_empty_columns, + self.out_merged_csv_name, + self.merge_files_button, + ) + + self.view = pn.Tabs( + ("Annotation engine - .nc", self.anotation_engine_tab), + ("Annotation engine - .tif", self.anotation_engine_tif_tab), + ("Crop & interpolate csv", self.crop_interpolate_tab), + ("Merge csv", self.merge_tab), + ) + + self.simple_interp_button.on_click(self.run_interpolate_missing_only) + self.load_data_button.on_click(self.load_ids_from_file) + self.make_csv.on_click(self.run_make_csv) + self.merge_files_button.on_click(self.run_merge_files) + self.taxon_name_val.param.watch(self.update_individual_ids_by_taxon, 'value') + self.load_env_button.on_click(self.load_env_data) + self.load_bound_button.on_click(self.load_boundary_data) + self.reset_bound_button.on_click(self.reset_boundary_data) + self.load_movement_button.on_click(self.load_movement_data) + self.taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon, 'value') + self.make_annotation_button.on_click(self.run_annotation) + self.env_continuous_selector.param.watch(lambda e: 
self.update_env_info_text(self._get_selected_env_vars()), "value") + self.env_categorical_selector.param.watch(lambda e: self.update_env_info_text(self._get_selected_env_vars()), "value") + self.taxon_multiselect.param.watch(lambda e: self.update_movement_info_text("Taxons", e.new), "value") + self.id_multiselect.param.watch(lambda e: self.update_movement_info_text("IDs", e.new), "value") + self.interpolation_method.param.watch(self._update_smoothing_options, 'value') + self.env_spatial_mode.param.watch(self._apply_env_spatial_mode, "value") + self._apply_env_spatial_mode() + ######TIF on click + self.tif_load_env_button.on_click(self.load_env_data_tif) + self.tif_load_bound_button.on_click(self.load_boundary_data_tif) + self.tif_reset_bound_button.on_click(self.reset_boundary_data) + self.tif_load_movement_button.on_click(self.load_movement_data_tif) + self.tif_make_annotation_button.on_click(self.run_annotation_tif) + self.tif_taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon_tif, 'value') + self.tif_continuous_vars.param.watch( + lambda e: self.update_env_info_text_tif( + list(self.tif_continuous_vars.value or []) + [ + v for v in list(self.tif_categorical_vars.value or []) + if v not in list(self.tif_continuous_vars.value or []) + ] + ), + "value" + ) + self.tif_categorical_vars.param.watch( + lambda e: self.update_env_info_text_tif( + list(self.tif_continuous_vars.value or []) + [ + v for v in list(self.tif_categorical_vars.value or []) + if v not in list(self.tif_continuous_vars.value or []) + ] + ), + "value" + ) + self.tif_taxon_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("Taxons", e.new), "value") + self.tif_id_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("IDs", e.new), "value") + self.tif_interpolation_method.param.watch(self._update_smoothing_options_tif, 'value') + self.tif_apply_scale.param.watch(self._update_tif_scale_widgets, "value") + self._update_tif_scale_widgets() + 
self.tif_continuous_vars.param.watch(self._sync_tif_variable_type_selection, "value") + self.tif_categorical_vars.param.watch(self._sync_tif_variable_type_selection, "value") + + + @try_catch("Error loading Individual IDs") + def load_ids_from_file(self, *events): + self.status_text = "Loading IDs..." + self.alert.object = self.status_text + file_path = self.local_ID_file.value + + if not file_path: + self.status_text = "No file selected." + self.alert.object = self.status_text + return + + try: + df = pd.read_csv(file_path) + df.columns = [re.sub(r"[-._\s]+", "_", col.lower()) for col in df.columns] # normalize + self.df = df + self._set_time_slider_from_df(df) + unique_ids = sorted(df["individual_local_identifier"].dropna().astype(str).unique()) + self.individual_ID.options = list(unique_ids) + self.individual_ID.disabled = False + + if "individual_taxon_canonical_name" in df.columns: + unique_taxa = sorted(df["individual_taxon_canonical_name"].dropna().astype(str).unique()) + self.taxon_name_val.options = list(unique_taxa) + self.taxon_name_val.disabled = False + self.status_text = f"Loaded {len(unique_ids)} Individual IDs and {len(unique_taxa)} Taxon names." + else: + self.status_text = f"Loaded {len(unique_ids)} Individual IDs. Column 'individual_taxon_canonical_name' not found." 
+ + except Exception as e: + logger.exception("Error loading IDs") + self.status_text = f"Error: {e}" + + self.alert.object = self.status_text + + def update_individual_ids_by_taxon(self, event): + if self.df is None: + return + + selected_taxa = event.new + + if not selected_taxa: + unique_ids = sorted(self.df["individual_local_identifier"].dropna().astype(str).unique()) + self.individual_ID.options = list(unique_ids) + self.individual_ID.value = [] + else: + filtered_df = self.df[self.df["individual_taxon_canonical_name"].isin(selected_taxa)] + unique_ids = sorted(filtered_df["individual_local_identifier"].dropna().astype(str).unique()) + self.individual_ID.options = list(unique_ids) + self.individual_ID.value = list(unique_ids) + + def update_annotation_ids_by_taxon(self, event): + if self.df is None: + return + + selected_taxa = event.new + if not selected_taxa: + ids = sorted(self.df["individual_local_identifier"].dropna().astype(str).unique()) + else: + filtered = self.df[self.df["individual_taxon_canonical_name"].isin(selected_taxa)] + ids = sorted(filtered["individual_local_identifier"].dropna().astype(str).unique()) + + self.id_multiselect.options = ids + self.id_multiselect.value = ids + + + @try_catch("Error generating CSV") + def run_make_csv(self, *events): + try: + individual_ids = self.individual_ID.value + csv_path = Path(self.local_ID_file.value) + interval_minutes = int(self.time_interval.value) + + start_time, end_time = self.time_selection_ID.value + start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f") if not isinstance(start_time, str) else start_time + end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S.%f") if not isinstance(end_time, str) else end_time + + out_csv = self.out_csv_name.value + columns = validate_and_process_csv(csv_path) + + output_files = generate_individual_csvs_for_local_ids( + csv_file=csv_path, + ids=individual_ids, + start_time=start_time_str, + end_time=end_time_str, + interval_minutes=interval_minutes, + 
output_path_template=out_csv, + columns_to_interpolate=columns, + deployment_time_gap=int(self.deployment_time_gap.value), + min_expected_obs=int(self.min_expected_obs.value), + start_from_midnight=bool(self.start_from_midnight.value) + ) + + if self.merge_files.value: + merged_df = pd.concat([pd.read_csv(f) for f in output_files], ignore_index=True) + merged_output_path = out_csv.replace(".csv", "_merged.csv") + merged_df.to_csv(merged_output_path, index=False) + + if self.delete_individual_ID_files.value: + for f in output_files: + try: + Path(f).unlink() + except Exception as e: + logger.warning(f"Failed to delete {f}: {e}") + + self.status_text = f"Processing complete. Output saved to: {Path(out_csv).parent}" + except Exception as e: + logger.exception("Failed to generate CSV") + self.status_text = f"Failed: {e}" + + self.alert.object = self.status_text + + def _set_time_slider_from_df(self, df: pd.DataFrame): + # time column after name normalization + candidates = ("timestamp", "eobs_start_timestamp", "time", "datetime", "date") + time_col = next((c for c in candidates if c in df.columns), None) + if not time_col: + return + + ts = pd.to_datetime(df[time_col], errors="coerce") + ts = ts[ts.notna()] + if ts.empty: + return + + tmin = pd.Timestamp(ts.min()).to_pydatetime() + tmax = pd.Timestamp(ts.max()).to_pydatetime() + + # update the slider limits and values + self.time_selection_ID.start = tmin + self.time_selection_ID.end = tmax + self.time_selection_ID.value = (tmin, tmax) + + + @try_catch("Error merging files from folder") + def run_merge_files(self, *events): + try: + folder_path = Path(self.folder_to_merge.value) + merged_df, deleted_columns = merge_csv_files_from_folder(folder_path, self.delete_empty_columns.value) + + merged_output_path = self.out_merged_csv_name.value + merged_df.to_csv(merged_output_path, index=False) + + deleted_msg = f"\nDeleted columns: {', '.join(deleted_columns)}" if deleted_columns else "\nNo columns deleted." 
+ self.status_text = f"Merged CSV saved: {merged_output_path}{deleted_msg}" + except Exception as e: + logger.exception("Failed to merge files") + self.status_text = f"Failed: {e}" + + self.alert.object = self.status_text + + def _is_categorical_var(self, var_name: str, da) -> bool: + """ + Heuristic classification: + - QC/flag/mask/class/category in name -> categorical + - integer dtype + flag_values/flag_meanings attrs -> categorical + - integer dtype + small number of unique values (sample) -> categorical + """ + name = (var_name or "").lower() + name_hits = ["qc", "quality", "flag", "mask", "class", "category", "type", "landcover", "biome"] + if any(h in name for h in name_hits): + return True + + try: + import numpy as np + if np.issubdtype(da.dtype, np.integer): + attrs = getattr(da, "attrs", {}) or {} + if ("flag_values" in attrs) or ("flag_meanings" in attrs): + return True + + # sample uniqueness (avoid loading whole array) + # take first time slice if possible + sample = da + for dim in da.dims: + if dim.lower() in ("time",): + sample = sample.isel({dim: 0}) + break + vals = sample.values + flat = vals.ravel() + flat = flat[:5000] # cap + flat = flat[~np.isnan(flat)] if flat.dtype.kind == "f" else flat + uniq = np.unique(flat) + if len(uniq) <= 32: + return True + except Exception: + pass + + return False + + def _enforce_env_split_unique(self, changed: str, new_values: list): + """ + Ensure the same variable cannot be selected in both selectors. + changed: "cont" or "cat" + """ + cont = list(self.env_continuous_selector.value or []) + cat = list(self.env_categorical_selector.value or []) + + if changed == "cont": + # remove from categorical... 
+ overlap = set(new_values) & set(cat) + if overlap: + self.env_categorical_selector.value = [v for v in cat if v not in overlap] + + elif changed == "cat": + overlap = set(new_values) & set(cont) + if overlap: + self.env_continuous_selector.value = [v for v in cont if v not in overlap] + + + def _wire_env_split_guards(self): + """ + Attach watchers for mutual exclusivity. + Call once in __init__. + """ + self.env_continuous_selector.param.watch( + lambda e: self._enforce_env_split_unique("cont", list(e.new or [])), + "value" + ) + self.env_categorical_selector.param.watch( + lambda e: self._enforce_env_split_unique("cat", list(e.new or [])), + "value" + ) + + + def _normalize_interp_key(self, ui_value: str) -> str: + """ + Convert UI label -> internal key expected by annotation engine. + Returns 'nearest' or 'idw' (fallback: original string). + """ + s = (ui_value or "").strip().lower() + if s.startswith("nearest"): + return "nearest" + if s.startswith("inverse") or "idw" in s: + return "idw" + if "bilinear" in s: + return "bilinear" + return ui_value + + def _apply_env_selector_labels(self): + """Make selector purposes obvious in UI.""" + self.env_continuous_selector.name = "Continuous (use Ctrl or ⌘ for multiple)" + self.env_categorical_selector.name = "Categorical/QC (use Ctrl or ⌘ for multiple)" + + def _apply_env_spatial_mode(self, event=None): + """Enable/disable coordinate selectors depending on selected spatial mode.""" + is_projected = self.env_spatial_mode.value == "Projected (x/y)" + + self.nc_lat_var.disabled = is_projected + self.nc_lon_var.disabled = is_projected + + self.env_x_select.disabled = not is_projected + self.env_y_select.disabled = not is_projected + + @try_catch("Error loading environmental data") + def load_env_data(self, *events): + """We select exactly one .nc, update File/Time/Spatial and the list of 3D variables.""" + self.status_text = "Loading environmental data..." 
+ self.alert.object = self.status_text + + raw = self.env_data_selector.value + if not raw: + self.status_text = "Please select one .nc file." + self.alert.object = self.status_text + return + + # If the selector suddenly returns a list, we require exactly 1 + if isinstance(raw, (list, tuple, set)): + if len(raw) != 1: + self.status_text = "Select exactly one .nc file." + self.alert.object = self.status_text + return + nc_path = str(list(raw)[0]).strip() + else: + nc_path = str(raw).strip() + + if Path(nc_path).suffix.lower() != ".nc": + self.status_text = "Only .nc is supported on this tab." + self.alert.object = self.status_text + return + + # Update "File:" immediately + self._update_info_lines(self.env_info, {"File:": Path(nc_path).name}) + self._auto_height(self.env_info) + + var_file_map: dict[str, str] = {} + time_text = "-" + spatial_text = "-" + + try: + ds = open_nc_metadata(nc_path) + try: + all_vars = sorted(set(ds.coords.keys()) | set(ds.variables.keys())) + coord_guess = detect_env_coord_names(ds) + + # Populate all dropdowns + self.nc_time_var.options = all_vars + self.nc_lat_var.options = all_vars + self.nc_lon_var.options = all_vars + self.env_x_select.options = all_vars + self.env_y_select.options = all_vars + + # Autoselect defaults while preserving valid existing choices. 
+ self.nc_time_var.value = ( + coord_guess.get("env_time") + if coord_guess.get("env_time") in all_vars + else (self.nc_time_var.value if self.nc_time_var.value in all_vars else None) + ) + self.nc_lat_var.value = ( + coord_guess.get("env_lat") + if coord_guess.get("env_lat") in all_vars + else (self.nc_lat_var.value if self.nc_lat_var.value in all_vars else None) + ) + self.nc_lon_var.value = ( + coord_guess.get("env_lon") + if coord_guess.get("env_lon") in all_vars + else (self.nc_lon_var.value if self.nc_lon_var.value in all_vars else None) + ) + self.env_x_select.value = ( + coord_guess.get("env_x") + if coord_guess.get("env_x") in all_vars + else (self.env_x_select.value if self.env_x_select.value in all_vars else None) + ) + self.env_y_select.value = ( + coord_guess.get("env_y") + if coord_guess.get("env_y") in all_vars + else (self.env_y_select.value if self.env_y_select.value in all_vars else None) + ) + + has_latlon = bool(self.nc_lat_var.value and self.nc_lon_var.value) + has_xy = bool(self.env_x_select.value and self.env_y_select.value) + if has_latlon and not has_xy: + self.env_spatial_mode.value = "Geographic (lat/lon)" + elif has_xy and not has_latlon: + self.env_spatial_mode.value = "Projected (x/y)" + + # ---- TIME INFO ---- + time_name = self.nc_time_var.value + if time_name and (time_name in ds.coords or time_name in ds.variables): + try: + decoded_times = xr.decode_cf(ds[[time_name]], decode_times=True)[time_name] + tmin = pd.to_datetime(decoded_times.min().values) + tmax = pd.to_datetime(decoded_times.max().values) + time_text = f"{tmin.date()} — {tmax.date()}" + except Exception: + time_text = "-" + + # ---- SPATIAL INFO ---- + lat_name = self.nc_lat_var.value + lon_name = self.nc_lon_var.value + if lat_name and lat_name in ds and lon_name and lon_name in ds: + lat_min = float(ds[lat_name].min()) + lat_max = float(ds[lat_name].max()) + lon_min = float(ds[lon_name].min()) + lon_max = float(ds[lon_name].max()) + spatial_text = ( + 
f"lat[{lat_min:.3f}..{lat_max:.3f}], " + f"lon[{lon_min:.3f}..{lon_max:.3f}]" + ) + else: + x_name = self.env_x_select.value + y_name = self.env_y_select.value + if x_name and x_name in ds and y_name and y_name in ds: + x_min = float(ds[x_name].min()) + x_max = float(ds[x_name].max()) + y_min = float(ds[y_name].min()) + y_max = float(ds[y_name].max()) + spatial_text = ( + f"{y_name}[{y_min:.3f}..{y_max:.3f}], " + f"{x_name}[{x_min:.3f}..{x_max:.3f}]" + ) + + # ---- VARIABLE LIST with vertical level expansion ---- + LEVEL_DIM_CANDIDATES_LOCAL = ( + "isobaricInhPa", "isobaric_in_hPa", "level", + "lev", "plev", "pressure", "pressure_level" + ) + for var in ds.data_vars: + da = ds[var] + if da.ndim < 3: + continue + dims = list(da.dims) + level_dim = next( + (d for d in LEVEL_DIM_CANDIDATES_LOCAL if d in dims), None + ) + if level_dim is None: + var_file_map[var] = nc_path + else: + try: + level_vals = ds[level_dim].values + except Exception: + level_vals = [] + for lv in level_vals: + try: + lv_int = int(round(float(lv))) + var_file_map[f"{var}_{lv_int}"] = nc_path + except Exception: + continue + + finally: + ds.close() + + except Exception as e: + self.status_text = f"Failed to open dataset: {e}" + self.alert.object = self.status_text + return + + # Update Time/Spatial information block + self._update_info_lines(self.env_info, { + "Time range:": time_text, + "Spatial range:": spatial_text, + }) + self._auto_height(self.env_info) + + if not var_file_map: + self.env_continuous_selector.options = [] + self.env_categorical_selector.options = [] + self.env_continuous_selector.value = [] + self.env_categorical_selector.value = [] + self.status_text = "No 3D variables (e.g. time/lat/lon) found in the file." 
+ self.alert.object = self.status_text + return + + self.env_variable_sources = var_file_map + all_labels = list(var_file_map.keys()) + + self.env_continuous_selector.options = all_labels + self.env_categorical_selector.options = all_labels + self.env_continuous_selector.value = [] + self.env_categorical_selector.value = [] + + self.status_text = ( + f"Loaded {len(all_labels)} variable(s). " + "Now split them into Continuous vs Categorical/QC." + ) + self.alert.object = self.status_text + self._sync_nc_column_heights() + + + @try_catch("Error loading boundary data") + def load_boundary_data(self, *events): + self.status_text = "Loading boundary data..." + self.alert.object = self.status_text + + file_input = self.bound_data_selector.value + + if not file_input: + self.status_text = "Please select one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + + # If multiple files are selected + if isinstance(file_input, list): + if len(file_input) != 1: + self.status_text = "Please select exactly one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + file_path = file_input[0] + else: + file_path = file_input + + try: + path, S, N, W, E = load_vector_extent_info(file_path) + self.boundary_path = path + self.boundary_info_str.object = ( + f"Boundary file: {Path(path).name}
" + f"Spatial range: lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + self.status_text = ( + f"Boundary loaded from {Path(path).name}: " + f"lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to read vector file: {e}" + self.alert.object = self.status_text + self._sync_nc_column_heights() + + + @try_catch("Error loading movement data") + def load_movement_data(self, *events): + self.status_text = "Loading movement data..." + self.alert.object = self.status_text + + file_path = self.movement_data_selector.value + if not file_path: + self.status_text = "No movement file selected." + self.alert.object = self.status_text + return + + df, taxa, ids, err = load_taxa_and_ids_from_csv(file_path) + if err: + self.status_text = f"Error: {err}" + self.alert.object = self.status_text + return + + # normalize headings + df.columns = [re.sub(r"[-._\s]+", "_", col.lower()) for col in df.columns] + if "location_long" in df.columns and "location_lon" not in df.columns: + df["location_lon"] = df["location_long"] + self.df = df + self.id_multiselect.options = ids + self.id_multiselect.disabled = False + self.taxon_multiselect.options = taxa + self.taxon_multiselect.disabled = False + self.status_text = f"Loaded {len(ids)} IDs and {len(taxa)} taxon names." 
+ cols = set(df.columns) + # TIME + time_col = next((c for c in ("timestamp","time","datetime","date") if c in cols), None) + ts = pd.to_datetime(df[time_col], errors="coerce") if time_col else None + time_text = "-" + if ts is not None and ts.notna().any(): + tmin, tmax = ts.min(), ts.max() + time_text = f"Time range: {tmin:%Y-%m-%d %H:%M:%S} — {tmax:%Y-%m-%d %H:%M:%S}" + + # SPATIAL + lat_col = next((c for c in ("location_lat","latitude","lat","y") if c in cols), None) + lon_col = next((c for c in ("location_lon","longitude","lon","x") if c in cols), None) + spatial_text = "-" + if lat_col and lon_col: + lat = pd.to_numeric(df[lat_col], errors="coerce") + lon = pd.to_numeric(df[lon_col], errors="coerce") + if lat.notna().any() and lon.notna().any(): + spatial_text = (f"Spatial range: " + f"lat[{float(lat.min()):.3f}..{float(lat.max()):.3f}], " + f"lon[{float(lon.min()):.3f}..{float(lon.max()):.3f}]") + + lines = (self.movement_info.object or + "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
").split("
") + for i, line in enumerate(lines): + if line.startswith("Time range:"): + lines[i] = time_text + if line.startswith("Spatial range:"): + lines[i] = spatial_text + self.movement_info.object = "
".join(lines) + + self.alert.object = self.status_text + self._sync_nc_column_heights() + + def _get_selected_env_vars(self): + cont = list(getattr(self.env_continuous_selector, "value", []) or []) + cat = list(getattr(self.env_categorical_selector, "value", []) or []) + seen = set() + out = [] + for v in cont + cat: + if v not in seen: + seen.add(v) + out.append(v) + return out + + + @try_catch("Error during annotation") + def run_annotation(self, *events): + self.status_text = "Running annotation..." + self.alert.object = self.status_text + try: + continuous_vars = list(getattr(self.env_continuous_selector, "value", []) or []) + categorical_vars = list(getattr(self.env_categorical_selector, "value", []) or []) + # Preserve variable order without duplicates + seen = set() + selected_vars = [] + for v in continuous_vars + categorical_vars: + if v not in seen: + seen.add(v) + selected_vars.append(v) + + selected_ids = self.id_multiselect.value + env_var_map = getattr(self, "env_variable_sources", {}) + movebank_path = self.movement_data_selector.value + boundary_path = getattr(self, "boundary_path", None) + interpolation_method = self._normalize_interp_key(self.interpolation_method.value) + spatial_mode = self.env_spatial_mode.value + + if spatial_mode == "Projected (x/y)" and interpolation_method != "bilinear": + self.status_text = ( + "Projected (x/y) mode currently supports only " + "Bilinear (projected x/y, time-linear) interpolation." + ) + self.alert.object = self.status_text + return + + if spatial_mode == "Geographic (lat/lon)" and interpolation_method == "bilinear": + self.status_text = ( + "Bilinear projected interpolation requires Projected (x/y) mode." + ) + self.alert.object = self.status_text + return + smoothing_points = int(self.control_smoothing.value) + + if not selected_vars: + self.status_text = "No environmental variables selected." + elif not selected_ids: + self.status_text = "No individual IDs selected." 
+ elif not movebank_path: + self.status_text = "No Movebank data file selected." + else: + bbox = None + if not boundary_path: + first_var = selected_vars[0] + nc_path = env_var_map.get(first_var) + if not nc_path: + self.status_text = "Cannot derive boundary: missing .nc path for selected variable." + self.alert.object = self.status_text + return + + if self.env_spatial_mode.value == "Geographic (lat/lon)": + try: + bounds = get_nc_bounds(nc_path, env_coord_names={ + "env_time": self.nc_time_var.value, + "env_lat": self.nc_lat_var.value, + "env_lon": self.nc_lon_var.value, + "env_x": None, + "env_y": None, + }) + bbox = bounds + self.boundary_info_str.object = ( + "Boundary file: not selected (auto from .nc)
" + f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " + f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to derive boundary from .nc: {e}" + self.alert.object = self.status_text + return + else: + bbox = None + self.boundary_info_str.object = ( + "Boundary file: not selected
" + "Spatial range: using projected grid extent (x/y); bbox cropping disabled." + ) + + self.status_text = "Annotation started." + + if self.env_spatial_mode.value == "Projected (x/y)": + coord_spec = None # bilinear не використовує lat/lon coord_spec + env_coord_names = { + "env_time": self.nc_time_var.value, + "env_lat": None, + "env_lon": None, + "env_x": self.env_x_select.value, + "env_y": self.env_y_select.value, + } + + if not (self.nc_time_var.value and self.env_x_select.value and self.env_y_select.value): + self.status_text = "Please select Time, X and Y variables from the NetCDF file." + self.alert.object = self.status_text + return + + if interpolation_method == "bilinear" and categorical_vars: + self.status_text = ( + "Bilinear projected interpolation is only valid for continuous variables. " + "Please remove categorical/QC variables or use Nearest/IDW mode." + ) + self.alert.object = self.status_text + return + + else: + coord_spec = { + "time": self.nc_time_var.value, + "lat": self.nc_lat_var.value, + "lon": self.nc_lon_var.value, + } + env_coord_names = { + "env_time": self.nc_time_var.value, + "env_lat": self.nc_lat_var.value, + "env_lon": self.nc_lon_var.value, + "env_x": None, + "env_y": None, + } + + if not (self.nc_time_var.value and self.nc_lat_var.value and self.nc_lon_var.value): + self.status_text = "Please select Time, Latitude and Longitude variables from the NetCDF file." + self.alert.object = self.status_text + return + + start_annotation_process( + env_var_map, + selected_vars, + movebank_path, + selected_ids, + boundary_path, + interpolation_method, + bbox=bbox, + smoothing_k=smoothing_points, + out_csv_path=self.output_path.value, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars, + ) + self.status_text = "Annotation finished." 
+ + except Exception as e: + self.status_text = f"Annotation failed: {e}" + + self.alert.object = self.status_text + + + ####TIF + @try_catch("Error loading TIF environmental data") + def load_env_data_tif(self, *events): + """ + Load environmental data from an AppEEARS GeoTIFF folder, convert it to a + single multi-variable NetCDF, and populate the TIF tab UI. + + Workflow: + 1) Validate that the user selected any *.tif in the target folder. + 2) Use the TIF folder as the output directory for the generated temporary NetCDF. + 3) Convert the set of TIFs in that folder → one NetCDF via + `convert_tif_to_nc_before_annotation` (each parsed variable = separate DataArray). + 4) Open the produced NetCDF with `safe_open_nc_with_time_decoding` and: + - extract Time range and Spatial extent, + - build `tif_env_var_map` ONLY for variables that are 3D and have a time dimension. + 5) Update the UI: + - Info panel (File/Time/Spatial), + - Multiselect options/values, + - Status text. + + Notes: + - The resulting `self.tif_env_var_map` is later used by `run_annotation_tif()` directly, + so we avoid re-reading all `data_vars` again. + - `self.tif_nc_path` is stored for fallbacks (e.g., bbox from nc if no boundary). + """ + # 0) Initial UI/status + self.status_text = "Loading TIF environmental data..." 
+ self.alert.object = self.status_text + + # 1) Validate a sample TIF and collect folder + tif_sample_path = Path(getattr(self.tif_env_data_selector, "value", "") or "") + if (not tif_sample_path.is_file()) or (tif_sample_path.suffix.lower() != ".tif"): + self.status_text = f"Selected path is not a .tif file: {tif_sample_path}" + self.alert.object = self.status_text + return + + folder_path = tif_sample_path.parent + tif_files = sorted([str(p) for p in folder_path.glob("*.tif") if p.is_file()]) + if not tif_files: + self.status_text = f"No .tif files found in: {folder_path}" + self.alert.object = self.status_text + return + + # 2) Write the temporary NetCDF next to the source TIF files. + # Movebank data is not required at this stage. + # The temporary NetCDF is always saved next to the input TIF files. + output_dir = str(folder_path) + + # 3) Convert TIF to NetCDF + try: + nc_path = convert_tif_to_nc_before_annotation(tif_files, output_dir) + except Exception as e: + self.status_text = f"Failed to convert TIF to NetCDF: {e}" + self.alert.object = self.status_text + return + + # Cache for later (bbox fallback, re-open, etc.) 
+ self.tif_nc_path = nc_path + + # 4) Inspect NetCDF and keep ONLY 3D variables with a time dimension + var_file_map: dict[str, str] = {} + time_text = "Time range: -" + spatial_text = "Spatial range: -" + + try: + ds = safe_open_nc_with_time_decoding(nc_path) + + # Time range (if present) + if ("time" in ds.coords) or ("time" in ds.variables): + try: + tmin = pd.to_datetime(ds["time"].values.min()) + tmax = pd.to_datetime(ds["time"].values.max()) + time_text = f"Time range: {tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}" + except Exception: + # Keep default if something goes wrong + pass + + # Spatial extent (lat/lon candidates can vary) + lat_name = next((c for c in ("lat", "latitude", "y") if c in ds.coords or c in ds.variables), None) + lon_name = next((c for c in ("lon", "longitude", "x","long") if c in ds.coords or c in ds.variables), None) + if lat_name and lon_name: + try: + lat_min = float(ds[lat_name].min()) + lat_max = float(ds[lat_name].max()) + lon_min = float(ds[lon_name].min()) + lon_max = float(ds[lon_name].max()) + spatial_text = ( + f"Spatial range: lat[{lat_min:.3f}..{lat_max:.3f}], " + f"lon[{lon_min:.3f}..{lon_max:.3f}]" + ) + except Exception: + pass + + # Build map: ONLY variables that (a) have a 'time' dim and (b) are 3D or higher + var_names: list[str] = [] + for v in ds.data_vars: + da = ds[v] + if ("time" in da.dims) and (da.ndim >= 3): + var_file_map[v] = nc_path + var_names.append(v) + + except Exception as e: + self.status_text = f"Failed to open/inspect NetCDF: {e}" + self.alert.object = self.status_text + return + finally: + try: + ds.close() + except Exception: + pass + + # 5) Update UI: info panel, multiselect, status + # Info panel (use common helper to insert/replace rows) + self._update_info_lines(self.tif_env_info, { + "File:": Path(nc_path).name, + "Time range:": time_text.replace("Time range: ", ""), + "Spatial range:": spatial_text.replace("Spatial range: ", "") + }) + + if not var_file_map: + # No valid 3D 
variables (time/lat/lon) found + self.tif_env_var_map = {} + + self.tif_env_data_multiselect.options = [] + self.tif_env_data_multiselect.value = [] + + self.tif_continuous_vars.options = [] + self.tif_continuous_vars.value = [] + + self.tif_categorical_vars.options = [] + self.tif_categorical_vars.value = [] + + self.status_text = "No 3D (time/lat/lon) variables found in the generated NetCDF." + self.alert.object = self.status_text + return + + # Save valid TIF variables for annotation. + self.tif_env_var_map = var_file_map + self.tif_env_data_multiselect.options = var_names + self.tif_env_data_multiselect.value = [] + + # Populate TIF variable type selectors. + # This is an initial guess only; the user can manually change it. + continuous_guess, categorical_guess = self._guess_tif_variable_types(var_names) + + self.tif_continuous_vars.options = var_names + self.tif_categorical_vars.options = var_names + + self.tif_continuous_vars.value = continuous_guess + self.tif_categorical_vars.value = categorical_guess + + # Update info panel using the actual selected split + selected_for_info = continuous_guess + [ + v for v in categorical_guess + if v not in continuous_guess + ] + self.update_env_info_text_tif(selected_for_info) + + # Final status + self.status_text = ( + f"Converted {len(tif_files)} TIF files to NetCDF. " + f"Variables (3D/time): {', '.join(var_names)}. " + "Please check Continuous vs Categorical/QC selection." + ) + self.alert.object = self.status_text + + + @try_catch("Error running TIF annotation") + def run_annotation_tif(self, *events): + """ + Run annotation workflow for environmental data sourced from AppEEARS GeoTIFFs. + + Current TIF workflow: + 1) Validate user selections: + - Movebank CSV is required. + - A sample .tif file is required to identify the target TIF folder. + - Boundary file is optional; if it is not provided, the NetCDF extent is used. + + 2) Gather all *.tif files from the selected TIF folder. 
+ + 3) Convert the TIF stack to a temporary NetCDF via + `convert_tif_to_nc_before_annotation(...)`. + + Important: + - The temporary NetCDF is written to the same folder as the input TIF files. + - The conversion keeps raw raster values. + - No scale factor, add_offset, or automatic 0.0001 heuristic is applied during + TIF -> NetCDF conversion. + + 4) Build `env_var_map` for variables that are valid for annotation: + - variables must have a time dimension; + - variables must be at least 3D, typically variable(time, lat, lon). + + 5) Determine variables to annotate from the explicit type selectors: + - `self.tif_continuous_vars` + - `self.tif_categorical_vars` + + The same variable must not be selected in both lists. + + 6) Run annotation through `start_annotation_process(...)`. + + Continuous variables: + - use the selected spatial interpolation method; + - use linear temporal interpolation; + - may optionally receive post-sampling value correction: + corrected_value = sampled_value * scale_factor + add_offset. + + Categorical/QC variables: + - are sampled using nearest spatial grid cell and nearest available timestep; + - are not IDW-averaged; + - are not linearly interpolated in time; + - are not scaled or offset; + - remain raw category/flag/QC codes. + + 7) Save the annotated output CSV and per-individual CSV files through the backend. + + Required UI widgets: + - `self.tif_movement_data_selector`: + Movebank CSV path. + - `self.tif_env_data_selector`: + one sample .tif file inside the target TIF folder. + - `self.tif_continuous_vars`: + continuous environmental variables selected for annotation. + - `self.tif_categorical_vars`: + categorical/QC variables selected for annotation. + - `self.tif_id_multiselect`: + selected individual IDs. + - `self.tif_bound_data_selector`: + optional boundary file. + - `self.tif_interpolation_method`: + spatial interpolation method for continuous variables. 
+ - `self.tif_control_smoothing`: + number of nearest grid points for IDW. + - `self.tif_apply_scale`, `self.tif_scale_factor`, `self.tif_add_offset`: + optional post-sampling correction for continuous variables only. + - `self.tif_output_path`: + output CSV path. + + Status messages are written to `self.status_text` and mirrored in `self.alert.object`. + """ + self.status_text = "Starting annotation (TIF)…" + self.alert.object = self.status_text + + # 0) Validate inputs + # Movebank CSV (required) + movebank_path = getattr(self.tif_movement_data_selector, "value", None) + if not movebank_path or not Path(str(movebank_path)).is_file(): + self.status_text = "Please load Movebank data before running TIF annotation." + self.alert.object = self.status_text + return + + output_dir = str(Path(str(movebank_path)).parent) + + # Sample TIF file (to infer the target folder) + tif_sample = getattr(self, "tif_env_data_selector", None) + tif_sample = getattr(tif_sample, "value", None) + if not tif_sample or Path(tif_sample).suffix.lower() != ".tif": + self.status_text = "Please select a sample .tif file in the folder you want to annotate." + self.alert.object = self.status_text + return + + # Selected animal IDs (optional) + id_widget = getattr(self, "tif_id_multiselect", None)# or getattr(self, "id_multiselect", None) + selected_ids = list(getattr(id_widget, "value", [])) if id_widget else [] + if not selected_ids: + self.status_text = "Please select at least one individual ID before running TIF annotation." + self.alert.object = self.status_text + return + + # Optional boundary + bound_widget = getattr(self, "tif_bound_data_selector", None)# or getattr(self, "bound_data_selector", None) + boundary_path = getattr(bound_widget, "value", None) + if boundary_path and not Path(boundary_path).is_file(): + print(f"[WARN] Boundary file not found: {boundary_path}. 
Proceeding without boundary.") + boundary_path = None + + # Interpolation and time-fit options (prefer TIF-tab widgets; fallback to NC-tab) + interp_widget = getattr(self, "tif_interpolation_method", None) + #??? interp_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") + ui_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") + interp_method = self._normalize_interp_key(ui_method) + # Output CSV path (optional) + out_widget = getattr(self, "tif_output_path", None) + output_csv_path = getattr(out_widget, "value", None) + + # 1) Collect TIFs from the selected folder + folder_path = Path(tif_sample).parent + tif_paths = sorted(p for p in folder_path.glob("*.tif") if p.is_file()) + if not tif_paths: + self.status_text = f"No .tif files found in: {folder_path}" + self.alert.object = self.status_text + return + + # 2) Convert TIF → NetCDF (multi-variable, raw values only) + # Scale/offset is not applied here; optional correction is applied after sampling. + output_dir = str(folder_path) + nc_path = convert_tif_to_nc_before_annotation([str(p) for p in tif_paths], output_dir) + self.tif_nc_path = nc_path + + # 3) Read valid variables from NetCDF and build env_var_map + if getattr(self, "tif_env_var_map", None): + env_var_map = dict(self.tif_env_var_map) + var_names = list(env_var_map.keys()) + else: + # Fallback: inspect the .nc and keep only 3D with a time dim + env_var_map, var_names = {}, [] + try: + ds = safe_open_nc_with_time_decoding(nc_path) + try: + for v in ds.data_vars: + da = ds[v] + if ("time" in da.dims) and (da.ndim >= 3): + env_var_map[v] = nc_path + var_names.append(v) + finally: + ds.close() + except Exception as e: + self.status_text = f"Failed to read variables from NetCDF: {e}" + self.alert.object = self.status_text + return + + if not var_names: + self.status_text = "No 3D (time/lat/lon) variables found in the generated NetCDF." 
+ self.alert.object = self.status_text + return + + # 4) Which variables to annotate? + continuous_vars = list(getattr(self.tif_continuous_vars, "value", []) or []) + categorical_vars = list(getattr(self.tif_categorical_vars, "value", []) or []) + + overlap = set(continuous_vars) & set(categorical_vars) + if overlap: + self.status_text = ( + "The same variable cannot be selected as both Continuous and Categorical/QC: " + + ", ".join(sorted(overlap)) + ) + self.alert.object = self.status_text + return + + selected_vars = continuous_vars + [ + v for v in categorical_vars + if v not in continuous_vars + ] + + if not selected_vars: + self.status_text = "Please select at least one Continuous or Categorical/QC variable." + self.alert.object = self.status_text + return + + # 5) Kick off annotation + scale_msg = ( + f"scale={self.tif_scale_factor.value}, offset={self.tif_add_offset.value}" + if self.tif_apply_scale.value + else "off" + ) + + self.status_text = ( + f"Annotating variables: {', '.join(selected_vars)} | " + f"Continuous: {', '.join(continuous_vars) if continuous_vars else '-'} | " + f"Categorical/QC: {', '.join(categorical_vars) if categorical_vars else '-'} | " + f"Scale/offset: {scale_msg} | " + f"IDs: {len(selected_ids) if selected_ids else 'all/unspecified'} | " + f"Interpolation: {interp_method}" + ) + self.alert.object = self.status_text + + try: + start_loading_spinner() + except Exception: + pass + + try: + # Auto-bbox from .nc if no boundary file selected + bbox = None + if not boundary_path: + try: + bounds = get_nc_bounds(self.tif_nc_path) # {"S","N","W","E"} + bbox = bounds + self.tif_boundary_info_str.object = ( + "Boundary file: not selected (auto from .nc)
" + f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " + f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + ) + except Exception: + pass + start_annotation_process( + env_var_map=env_var_map, + selected_env_vars=selected_vars, + movebank_path=str(movebank_path), + selected_ids=selected_ids, + boundary_path=str(boundary_path) if boundary_path else None, + interpolation_method=interp_method, + bbox=bbox, + smoothing_k=int(self.tif_control_smoothing.value), + out_csv_path=output_csv_path, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars, + # TIF value correction is applied after sampling, + # and only to continuous variables. + apply_value_correction=bool(self.tif_apply_scale.value), + value_scale_factor=float(self.tif_scale_factor.value), + value_add_offset=float(self.tif_add_offset.value), + value_correction_vars=continuous_vars, + ) + self.status_text = "Annotation finished successfully (TIF)." + self.alert.object = self.status_text + except Exception as e: + self.status_text = f"Annotation failed (TIF): {e}" + self.alert.object = self.status_text + print("[ERROR] Annotation failed (TIF):", e) + finally: + try: + stop_loading_spinner() + except Exception: + pass + + + @try_catch("Error loading TIF boundary data") + def load_boundary_data_tif(self, *events): + self.status_text = "Loading TIF boundary data..." + self.alert.object = self.status_text + + file_input = self.tif_bound_data_selector.value + if not file_input: + self.status_text = "Please select one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + + if isinstance(file_input, list): + if len(file_input) != 1: + self.status_text = "Please select exactly one vector file (.shp or .geojson)." 
+ self.alert.object = self.status_text + return + file_path = file_input[0] + else: + file_path = file_input + + try: + path, S, N, W, E = load_vector_extent_info(file_path) + self.boundary_path = path + self.tif_boundary_info_str.object = ( + f"Boundary file: {Path(path).name}
" + f"Spatial range: lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + self.status_text = ( + f"TIF Boundary loaded: " + f"lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to read vector file: {e}" + self.alert.object = self.status_text + + + @try_catch("Error loading TIF movement data") + def load_movement_data_tif(self, *events): + self.status_text = "Loading TIF movement data..." + self.alert.object = self.status_text + + file_path = self.tif_movement_data_selector.value + if not file_path: + self.status_text = "No TIF movement file selected." + self.alert.object = self.status_text + return + + df, taxa, ids, err = load_taxa_and_ids_from_csv(file_path) + if err: + self.status_text = f"Error: {err}" + else: + df.columns = [re.sub(r"[-._\s]+", "_", col.lower()) for col in df.columns] + if "location_long" in df.columns and "location_lon" not in df.columns: + df["location_lon"] = df["location_long"] + self.df = df # shared for both tabs + self.tif_id_multiselect.options = ids + self.tif_id_multiselect.disabled = False + self.tif_taxon_multiselect.options = taxa + self.tif_taxon_multiselect.disabled = False + self.status_text = f"TIF: Loaded {len(ids)} IDs and {len(taxa)} taxon names." + mv_current = self.tif_movement_info.object or "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -" + lines = mv_current.split("
") + if lines: + lines[0] = f"File: {Path(file_path).name}" + + # + try: + ts = pd.to_datetime(df["timestamp"], errors="coerce") + lat = pd.to_numeric(df["location_lat"], errors="coerce") + lon = pd.to_numeric(df["location_lon"], errors="coerce") + if ts.notna().any(): + tmin = ts.min().strftime("%Y-%m-%d %H:%M:%S") + tmax = ts.max().strftime("%Y-%m-%d %H:%M:%S") + for i, line in enumerate(lines): + if line.startswith("Time range:"): + lines[i] = f"Time range: {tmin} — {tmax}" + if lat.notna().any() and lon.notna().any(): + lat_min, lat_max = float(lat.min()), float(lat.max()) + lon_min, lon_max = float(lon.min()), float(lon.max()) + for i, line in enumerate(lines): + if line.startswith("Spatial range:"): + lines[i] = f"Spatial range: lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" + + except Exception: + pass + + self.tif_movement_info.object = "
".join(lines) + self.alert.object = self.status_text + + + @try_catch("Interpolation (missing only) failed") + def run_interpolate_missing_only(self, *events): + # 1) input + csv_path = Path(self.local_ID_file.value) + if not csv_path.exists(): + self.status_text = "No file selected." + self.alert.object = self.status_text + return + + # 2) Determine the ID: if the user did not choose, take all + if self.df is None: + try: + df_tmp = pd.read_csv(csv_path) + df_tmp.columns = [re.sub(r"[-._:\s]+", "_", c.lower()) for c in df_tmp.columns] + except Exception as e: + self.status_text = f"Failed to read CSV: {e}" + self.alert.object = self.status_text + return + all_ids = sorted(df_tmp.get("individual_local_identifier", pd.Series([], dtype=str)).dropna().astype(str).unique()) + else: + all_ids = sorted(self.df.get("individual_local_identifier", pd.Series([], dtype=str)).dropna().astype(str).unique()) + + selected_ids = list(self.individual_ID.value) if self.individual_ID.value else all_ids + if not selected_ids: + self.status_text = "No IDs to process." + self.alert.object = self.status_text + return + + # 3) Time range + start_time, end_time = self.time_selection_ID.value + start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f") + end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S.%f") + + # 4) Which columns to interpolate + columns = validate_and_process_csv(csv_path) + + # 5) Call simplified interpolation + out_template = self.out_csv_name.value + created = interpolate_missing_values_only( + start_time_str, end_time_str, csv_path, selected_ids, columns, out_template + ) + # or: + # created = interpolate_missing_values_only(...) + + # 6) result + if created: + self.status_text = f"Interpolation complete. Files: {len(created)}. Example: {created[0]}" + else: + self.status_text = "Interpolation complete. No files created (no eligible gaps ≤ 1 day)." 
+ self.alert.object = self.status_text + + + def update_annotation_ids_by_taxon_tif(self, event): + if self.df is None: + return + + selected_taxa = event.new + if not selected_taxa: + ids = sorted(self.df["individual_local_identifier"].dropna().astype(str).unique()) + else: + filtered = self.df[self.df["individual_taxon_canonical_name"].isin(selected_taxa)] + ids = sorted(filtered["individual_local_identifier"].dropna().astype(str).unique()) + + self.tif_id_multiselect.options = ids + self.tif_id_multiselect.value = ids + + def update_env_info_text(self, selected_vars): + current = self.env_info.object or "" + lines = current.split("
") + updated_lines = [] + found = False + for line in lines: + if "Environment parameters" in line: + updated_lines.append(f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + found = True + else: + updated_lines.append(line) + if not found: + updated_lines.insert(1, f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + self.env_info.object = "
".join(updated_lines) + + + def update_movement_info_text(self, section, new_values): + current = self.movement_info.object or "" + lines = current.split("
") + updated_lines = [] + for line in lines: + if section == "Taxons" and "Taxons" in line: + updated_lines.append(f"Taxons: {', '.join(new_values) if new_values else '-'}") + elif section == "IDs" and "IDs" in line: + updated_lines.append(f"IDs: {', '.join(new_values) if new_values else '-'}") + else: + updated_lines.append(line) + self.movement_info.object = "
".join(updated_lines) + + + def update_env_info_text_tif(self, selected_vars): + current = self.tif_env_info.object or "" + if not current: + current = "File: not selected
Environment parameters: -
Time range: -
Spatial range: -
" + lines = current.split("
") + updated = [] + found = False + for line in lines: + if "Environment parameters" in line: + updated.append(f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + found = True + else: + updated.append(line) + if not found: + updated.insert(1, f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + self.tif_env_info.object = "
".join(updated) + + + def _guess_tif_variable_types(self, variables): + """ + Return initial (continuous, categorical) split for TIF-derived variables. + This is only a first guess. The user can manually change the selection. + """ + categorical_keywords = [ + "qc", + "quality", + "flag", + "mask", + "class", + "category", + "categorical", + "landcover", + "land_cover", + "classification", + "type", + ] + + categorical = [ + v for v in variables + if any(key in str(v).lower() for key in categorical_keywords) + ] + + continuous = [ + v for v in variables + if v not in categorical + ] + + return continuous, categorical + + def _sync_tif_variable_type_selection(self, event=None): + """ + Ensure that the same TIF-derived variable cannot be selected + as both continuous and categorical/QC. + """ + if getattr(self, "_syncing_tif_var_types", False): + return + + self._syncing_tif_var_types = True + try: + continuous = set(self.tif_continuous_vars.value or []) + categorical = set(self.tif_categorical_vars.value or []) + + overlap = continuous & categorical + if not overlap: + return + + # If the user changed Continuous, remove overlap from Categorical/QC. + if event is not None and event.obj is self.tif_continuous_vars: + self.tif_categorical_vars.value = [ + v for v in (self.tif_categorical_vars.value or []) + if v not in overlap + ] + + # If the user changed Categorical/QC, remove overlap from Continuous. + elif event is not None and event.obj is self.tif_categorical_vars: + self.tif_continuous_vars.value = [ + v for v in (self.tif_continuous_vars.value or []) + if v not in overlap + ] + + finally: + self._syncing_tif_var_types = False + + def update_movement_info_text_tif(self, section, new_values): + current = self.tif_movement_info.object or "" + if not current: + current = "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
" + lines = current.split("
") + updated = [] + for line in lines: + if section == "Taxons" and "Taxons" in line: + updated.append(f"Taxons: {', '.join(new_values) if new_values else '-'}") + elif section == "IDs" and "IDs" in line: + updated.append(f"IDs: {', '.join(new_values) if new_values else '-'}") + else: + updated.append(line) + self.tif_movement_info.object = "
".join(updated) + + + def _update_info_lines(self, pane, changes: dict): + """ + Safely updates rows in pane.object by tags: + changes = {"File:": "...", "Time range:": "...", "Spatial range:": "...", "Environment parameters:": "..."} + If the row with the tag does not exist, it is added. + """ + default = "File: not selected
Environment parameters: -
Time range: -
Spatial range: -
" + current = pane.object or default + lines = current.split("
") + idx = {} + for i, line in enumerate(lines): + for key in changes.keys(): + if line.strip().startswith(key): + idx[key] = i + + for key, val in changes.items(): + if key in idx: + lines[idx[key]] = f"{key} {val}" + else: + # insert at the end before the empty last one, if there is one + insert_pos = len(lines) - 1 if lines and lines[-1] == "" else len(lines) + lines.insert(insert_pos, f"{key} {val}") + + pane.object = "
".join(lines) + + + def _section(self, title, *items, height=None): + body = pn.Column(*items, sizing_mode="stretch_width") + + return pn.Card( + body, + title=title, + collapsible=False, + margin=(0, 0, 10, 0), + sizing_mode="stretch_width", + height=height, + ) + + + def _auto_height(self, pane, line_px=22, padding=8): + lines = [l for l in (pane.object or "").split("
") if l.strip()] + pane.height = line_px * max(1, len(lines)) + padding + + + def _update_smoothing_options(self, event): + key = self._normalize_interp_key(event.new) + + if key in ("nearest", "bilinear"): + self.control_smoothing.options = ["1"] + self.control_smoothing.value = "1" + self.control_smoothing.disabled = (key == "bilinear") + else: + self.control_smoothing.disabled = False + self.control_smoothing.options = ["2", "4", "6", "8"] + if self.control_smoothing.value == "1": + self.control_smoothing.value = "4" + + def _update_tif_scale_widgets(self, event=None): + """ + Enable scale factor / offset inputs only when post-sampling value correction is enabled. + """ + enabled = bool(self.tif_apply_scale.value) + self.tif_scale_factor.disabled = not enabled + self.tif_add_offset.disabled = not enabled + + def _update_smoothing_options_tif(self, event): + key = self._normalize_interp_key(event.new) + if key == "nearest": + self.tif_control_smoothing.options = ["1"] + self.tif_control_smoothing.value = "1" + else: + self.tif_control_smoothing.options = ["2", "4", "6", "8"] + if self.tif_control_smoothing.value == "1": + self.tif_control_smoothing.value = "4" + + + def _sync_nc_column_heights(self): + """Adjusts the height of the 2nd and 3rd columns to the 1st.""" + first = getattr(self, "_nc_col1", None) + second = getattr(self, "_nc_col2", None) + third = getattr(self, "_nc_col3", None) + if not first or not second or not third: + return + + if first.height is None: + pn.state.onload(lambda: self._apply_nc_height_from_first()) + else: + self._apply_nc_height_from_first() + + + def _apply_nc_height_from_first(self): + first = self._nc_col1 + if not first: + return + h = first.height + if h is None: + return + self._nc_col2.height = h + self._nc_col3.height = h + + + def reset_boundary_data(self, *events): + """ + Resets boundary to default: no file selected, range = environment boundary (.nc). 
+ Also clears self.boundary_path so annotation goes back to 'auto from .nc' mode. + """ + self.boundary_path = None + default_nc = "Boundary file: not selected
Spatial range: = environment data boundary" + default_tif = "Boundary file: not selected
Spatial range: = environment data boundary" + try: + self.boundary_info_str.object = default_nc + except Exception: + pass + try: + self.tif_boundary_info_str.object = default_tif + except Exception: + pass + + self.status_text = "Boundary reset to default (auto from .nc)." + self.alert.object = self.status_text + self._sync_nc_column_heights() + + +@register_view() +def view(): + viewer = movebank_annotation_engine() + template = DEFAULT_TEMPLATE(main=[viewer.alert, viewer.view]) + return template + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/app/apps/gridded_data_explorer_app.py b/ecodata/app/apps/gridded_data_explorer_app.py index 0acb05d..e81911e 100644 --- a/ecodata/app/apps/gridded_data_explorer_app.py +++ b/ecodata/app/apps/gridded_data_explorer_app.py @@ -163,8 +163,10 @@ class GriddedDataExplorer(param.Parameterized): ) # Progress bar and percent for saving - progress_indicator = param.ClassSelector(pn.indicators.Progress) - progress_percent = param.ClassSelector(pn.widgets.StaticText) + #progress_indicator = param.ClassSelector(pn.indicators.Progress) + progress_indicator = param.ClassSelector(class_=pn.indicators.Progress) + #progress_percent = param.ClassSelector(pn.widgets.StaticText) + progress_percent = param.ClassSelector(class_=pn.widgets.StaticText) # Save statistics stats_fname = param_widget( diff --git a/ecodata/app/apps/multidimensional_annotation_app.py b/ecodata/app/apps/multidimensional_annotation_app.py new file mode 100644 index 0000000..3bf2c4a --- /dev/null +++ b/ecodata/app/apps/multidimensional_annotation_app.py @@ -0,0 +1,1017 @@ +""" +Multidimensional Annotation UI +""" + +from __future__ import annotations + +import logging +import re +from pathlib import Path +from typing import List, Optional, Tuple + +import panel as pn +import pandas as pd + +try: + import xarray as xr +except Exception: # pragma: no cover + xr = None + +try: + import 
geopandas as gpd +except Exception: # pragma: no cover + gpd = None + +from ecodata.app.config import DEFAULT_TEMPLATE +from ecodata.app.models import FileSelector +from ecodata.panel_utils import register_view + +try: + from ecodata.annotation_eng_func import safe_open_nc_with_time_decoding, load_vector_extent_info +except Exception: # pragma: no cover + safe_open_nc_with_time_decoding = None + load_vector_extent_info = None + +try: + from ecodata.multidim_annotation_func import run_multidimensional_annotation_from_paths +except Exception as exc: # pragma: no cover + run_multidimensional_annotation_from_paths = None + BACKEND_IMPORT_ERROR = exc +else: + BACKEND_IMPORT_ERROR = None + +logger = logging.getLogger(__name__) + + +class Multidimensional_Annotation_App: + def __init__(self): + self.name = "Multidimensional Annotation Engine App (DEMO)" + self._movement_columns: List[str] = [] + self._movement_df: Optional[pd.DataFrame] = None + self.boundary_path: Optional[str] = None + + def make_file_selector(name: str, file_pattern: str = "*") -> FileSelector: + return FileSelector( + name=name, + directory=str(Path.home()), + file_pattern=file_pattern, + only_files=True, + constrain_path=False, + expanded=True, + size=10, + sizing_mode="stretch_width", + ) + + self.movement_csv = make_file_selector("Movement CSV", "*.csv") + self.load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary", sizing_mode="stretch_width") + + self.taxon_multiselect = pn.widgets.MultiSelect( + name="Select Taxon (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.id_multiselect = pn.widgets.MultiSelect( + name="Select ID (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + + self.id_column = pn.widgets.Select(name="ID column", options=[], value=None, sizing_mode="stretch_width") + self.time_column = pn.widgets.Select(name="Timestamp column", options=[], 
value=None, sizing_mode="stretch_width") + self.lat_column = pn.widgets.Select(name="Latitude column", options=[], value=None, sizing_mode="stretch_width") + self.lon_column = pn.widgets.Select(name="Longitude column", options=[], value=None, sizing_mode="stretch_width") + self.height_column = pn.widgets.Select(name="Height / altitude column", options=[], value=None, sizing_mode="stretch_width") + self.height_units = pn.widgets.Select(name="Height units", options=["m"], value="m", sizing_mode="stretch_width") + self.height_reference = pn.widgets.Select( + name="Height reference", + options=[ + "WGS84 ellipsoidal height (Movebank GPS height)", + "Already orthometric / MSL-like", + "Height above ground level (requires DEM)", + ], + value="WGS84 ellipsoidal height (Movebank GPS height)", + sizing_mode="stretch_width", + ) + self.geoid_mode = pn.widgets.Select( + name="Geoid correction", options=["geographiclib", "constant", "none"], value="geographiclib", sizing_mode="stretch_width" + ) + self.constant_geoid_undulation_m = pn.widgets.FloatInput( + name="Constant geoid undulation N, m", value=0.0, step=1.0, sizing_mode="stretch_width" + ) + self.movement_info = pn.pane.HTML( + "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width", + ) + + self.geopotential_file = make_file_selector("Geopotential file", "*.nc") + self.scan_geopotential_button = pn.widgets.Button(name="Scan geopotential file", button_type="primary", sizing_mode="stretch_width") + self.geopotential_variable = pn.widgets.Select(name="Geopotential variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_time_var = pn.widgets.Select(name="Time variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_lat_var = pn.widgets.Select(name="Latitude variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_lon_var = pn.widgets.Select(name="Longitude variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_level_var = pn.widgets.Select(name="Vertical / level variable", options=[], value=None, sizing_mode="stretch_width") + self.geopotential_units = pn.widgets.Select( + name="Geopotential units", options=["m2 s-2", "geopotential metres"], value="m2 s-2", sizing_mode="stretch_width" + ) + self.convert_geopotential_to_height = pn.widgets.Checkbox( + name="Convert geopotential to height using z / 9.80665", value=True, sizing_mode="stretch_width" + ) + self.gravity_constant = pn.widgets.FloatInput(name="Gravity constant", value=9.80665, step=0.00001, disabled=True) + + self.multilevel_var_file = make_file_selector("Annotated var (multilevel) file", "*.nc") + self.scan_multilevel_button = pn.widgets.Button(name="Scan multilevel file", button_type="primary", sizing_mode="stretch_width") + self.multilevel_variable = pn.widgets.Select(name="Annotated var (multilevel, first selected)", options=[], value=None, sizing_mode="stretch_width") + self.multilevel_continuous_vars = pn.widgets.MultiSelect( + name="Continuous multilevel variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=180, sizing_mode="stretch_width" + ) + self.multilevel_categorical_vars = pn.widgets.MultiSelect( + name="Categorical/QC multilevel variables (use Ctrl or ⌘ 
for multiple)", options=[], value=[], height=180, sizing_mode="stretch_width" + ) + + self.surface_var_file = make_file_selector("Annotated var (surface) file", "*.nc") + self.scan_surface_button = pn.widgets.Button(name="Scan surface file", button_type="default", sizing_mode="stretch_width") + self.surface_variable = pn.widgets.Select(name="Annotated var (surface, first selected)", options=[], value=None, sizing_mode="stretch_width") + self.surface_continuous_vars = pn.widgets.MultiSelect( + name="Continuous surface variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.surface_categorical_vars = pn.widgets.MultiSelect( + name="Categorical/QC surface variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.env_info = pn.pane.HTML( + "File: not selected
Multilevel parameters: -
Surface parameters: -
Time range: -
Spatial range: -
Vertical levels: -
", + sizing_mode="stretch_width", + ) + + self.boundary_file = make_file_selector("Boundary data (.shp/.geojson)", "*") + self.load_boundary_button = pn.widgets.Button(name="Load boundary data", button_type="primary", sizing_mode="stretch_width") + self.reset_boundary_button = pn.widgets.Button(name="(!) Reset boundary", button_type="primary", sizing_mode="stretch_width") + self.boundary_info = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environmental data boundary", sizing_mode="stretch_width" + ) + + self.spatial_interpolation_method = pn.widgets.Select( + name="Spatial interpolation method", + options=["Nearest neighbor", "Inverse Distance Weighting"], + value="Nearest neighbor", + sizing_mode="stretch_width", + ) + self.control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", options=["1", "2", "4", "6", "8"], value="1", sizing_mode="stretch_width" + ) + self.vertical_matching_method = pn.widgets.Select( + name="Vertical matching method", + options=["Nearest geopotential-height level", "Linear vertical interpolation"], + value="Nearest geopotential-height level", + sizing_mode="stretch_width", + ) + self.use_surface_as_lower_anchor = pn.widgets.Checkbox( + name="Use surface variable as lower vertical anchor", value=True, sizing_mode="stretch_width" + ) + self.surface_anchor_height_agl_m = pn.widgets.FloatInput( + name="Surface anchor height above ground, m", value=2.0, step=0.5, sizing_mode="stretch_width" + ) + + self.u_file = make_file_selector("U wind component file", "*.nc") + self.u_variable = pn.widgets.Select(name="U variable", options=[], value=None, sizing_mode="stretch_width") + self.v_file = make_file_selector("V wind component file", "*.nc") + self.v_variable = pn.widgets.Select(name="V variable", options=[], value=None, sizing_mode="stretch_width") + self.w_file = make_file_selector("W vertical velocity file", "*.nc") + self.w_variable = pn.widgets.Select(name="W variable", options=[], value=None, sizing_mode="stretch_width") + self.temperature_file = make_file_selector("Temperature file", "*.nc") + self.temperature_variable = pn.widgets.Select(name="Temperature variable", options=[], value=None, sizing_mode="stretch_width") + self.scan_optional_components_button = pn.widgets.Button( + name="Scan optional component files", button_type="default", sizing_mode="stretch_width" + ) + + self.topography_source = pn.widgets.Select( + name="Topography 
source", + options=["None", "ETOPO1 Ice Surface Global Relief Model", "SRTM 1 Arc-Second DEM", "ASTER ASTGTM3 Global 30-m DEM", "Custom DEM / GeoTIFF"], + value="None", + sizing_mode="stretch_width", + ) + self.dem_file = make_file_selector("DEM file", "*.tif") + self.dem_units = pn.widgets.Select(name="DEM vertical units", options=["m"], value="m", disabled=True, sizing_mode="stretch_width") + self.dem_reference = pn.widgets.Select( + name="DEM reference", options=["Assumed orthometric / MSL-like"], value="Assumed orthometric / MSL-like", disabled=True, sizing_mode="stretch_width" + ) + + self.derive_wind_support_crosswind = pn.widgets.Checkbox( + name="Wind support and cross wind", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_wind_speed_direction = pn.widgets.Checkbox( + name="Wind speed and direction", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_vertical_motion = pn.widgets.Checkbox( + name="Vertical motion from W", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_thermal_uplift = pn.widgets.Checkbox( + name="Thermal uplift / stability proxy", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_orographic_uplift = pn.widgets.Checkbox( + name="Orographic uplift", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.track_direction_source = pn.widgets.Select( + name="Track direction source", + options=["Compute from consecutive points", "Use existing heading column"], + value="Compute from consecutive points", + disabled=True, + sizing_mode="stretch_width", + ) + self.heading_column = pn.widgets.Select(name="Heading column", options=[], value=None, disabled=True, sizing_mode="stretch_width") + + self.output_csv = pn.widgets.TextInput( + name="Output CSV", value=str(Path.home() / "Downloads" / "multidimensional_annotation_output.csv"), sizing_mode="stretch_width" + ) + self.save_per_individual = pn.widgets.Checkbox(name="Save per individual", 
value=True, sizing_mode="stretch_width") + self.keep_diagnostics = pn.widgets.Checkbox(name="Keep diagnostic columns", value=True, sizing_mode="stretch_width") + self.validate_button = pn.widgets.Button(name="Validate configuration", button_type="primary", sizing_mode="stretch_width") + self.run_button = pn.widgets.Button(name="Run multidimensional annotation", button_type="primary", sizing_mode="stretch_width") + + self.preview = pn.pane.Markdown("### Preview\nNo files scanned yet.", sizing_mode="stretch_width", styles=self._pane_style()) + self.validation = pn.pane.Markdown("### Validation\nNot validated yet.", sizing_mode="stretch_width", styles=self._pane_style()) + self.log = pn.pane.Markdown("### Log\nReady.", sizing_mode="stretch_width", styles=self._pane_style()) + + self.load_movement_button.on_click(self._on_load_movement) + self.scan_geopotential_button.on_click(self._on_scan_geopotential) + self.scan_multilevel_button.on_click(self._on_scan_multilevel) + self.scan_surface_button.on_click(self._on_scan_surface) + self.scan_optional_components_button.on_click(self._on_scan_optional_components) + self.load_boundary_button.on_click(self._on_load_boundary) + self.reset_boundary_button.on_click(self._on_reset_boundary) + self.validate_button.on_click(self._on_validate) + self.run_button.on_click(self._on_run) + self.taxon_multiselect.param.watch(self._update_ids_by_taxon, "value") + self.spatial_interpolation_method.param.watch(self._update_smoothing_options, "value") + + for widget in ( + self.u_file, + self.v_file, + self.w_file, + self.temperature_file, + self.dem_file, + self.topography_source, + self.track_direction_source, + ): + widget.param.watch(self._update_dynamic_states, "value") + + self._wire_variable_split_guards() + self._update_smoothing_options() + self._update_dynamic_states() + + @staticmethod + def _pane_style(): + return {"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"} + + def _append_log(self, message: str) -> 
None: + old = self.log.object or "### Log\n" + if old.strip() == "### Log\nReady.": + old = "### Log\n" + self.log.object = old + f"\n- {message}" + + @staticmethod + def _file_value(value): + if value is None: + return None + if isinstance(value, (list, tuple, set)): + values = list(value) + return values[0] if values else None + return value + + @staticmethod + def _path_exists(value: str) -> bool: + value = Multidimensional_Annotation_App._file_value(value) + if not value: + return False + path = Path(str(value)).expanduser() + return path.exists() and path.is_file() + + @staticmethod + def _optional_path(value): + value = Multidimensional_Annotation_App._file_value(value) + if not value: + return None + path = Path(str(value)).expanduser() + return str(path) if path.exists() and path.is_file() else None + + @staticmethod + def _guess_column(columns: List[str], candidates: List[str]) -> Optional[str]: + lower_map = {c.lower(): c for c in columns} + normalized_map = {re.sub(r"[-:._\s]+", "_", c.lower()): c for c in columns} + for cand in candidates: + c1 = cand.lower() + c2 = re.sub(r"[-:._\s]+", "_", c1) + if c1 in lower_map: + return lower_map[c1] + if c2 in normalized_map: + return normalized_map[c2] + for col in columns: + cl = col.lower() + cn = re.sub(r"[-:._\s]+", "_", cl) + if any(cand.lower() in cl or re.sub(r"[-:._\s]+", "_", cand.lower()) in cn for cand in candidates): + return col + return columns[0] if columns else None + + @staticmethod + def _open_dataset_for_scan(path): + if xr is None: + raise RuntimeError("xarray is not available.") + if safe_open_nc_with_time_decoding is not None: + return safe_open_nc_with_time_decoding(path) + return xr.open_dataset(path, decode_times=False) + + @staticmethod + def _read_nc_metadata(path_value: str) -> Tuple[List[str], List[str], dict]: + path_value = Multidimensional_Annotation_App._file_value(path_value) + if not path_value: + raise FileNotFoundError("No NetCDF file selected.") + path = 
Path(str(path_value)).expanduser() + if not path.exists() or not path.is_file(): + raise FileNotFoundError(f"File not found or not a file: {path}") + if path.suffix.lower() != ".nc": + raise ValueError(f"Expected a .nc file, got: {path}") + + ds = Multidimensional_Annotation_App._open_dataset_for_scan(path) + try: + data_vars = sorted([str(v) for v in ds.data_vars]) + all_names = sorted([str(v) for v in ds.variables]) + meta = Multidimensional_Annotation_App._nc_info(ds) + return data_vars, all_names, meta + finally: + try: + ds.close() + except Exception: + pass + + @staticmethod + def _nc_info(ds) -> dict: + names = set(ds.variables) | set(ds.coords) + time_name = next((c for c in ("time", "valid_time", "forecast_time", "verification_time", "datetime", "date") if c in names), None) + lat_name = next((c for c in ("lat", "latitude", "y") if c in names), None) + lon_name = next((c for c in ("lon", "longitude", "long", "x") if c in names), None) + level_name = next((c for c in ("level", "lev", "plev", "pressure", "pressure_level", "isobaricInhPa", "isobaric_in_hPa") if c in names), None) + + time_text = "-" + spatial_text = "-" + level_text = "-" + + if time_name and time_name in ds: + try: + vals = pd.to_datetime(ds[time_name].values) + time_text = f"{vals.min():%Y-%m-%d %H:%M:%S} — {vals.max():%Y-%m-%d %H:%M:%S}" + except Exception: + pass + + if lat_name and lon_name and lat_name in ds and lon_name in ds: + try: + spatial_text = ( + f"lat[{float(ds[lat_name].min()):.3f}..{float(ds[lat_name].max()):.3f}], " + f"lon[{float(ds[lon_name].min()):.3f}..{float(ds[lon_name].max()):.3f}]" + ) + except Exception: + pass + + if level_name and level_name in ds: + try: + vals = ds[level_name].values + if len(vals) <= 20: + level_text = ", ".join([str(v) for v in vals]) + else: + level_text = f"{len(vals)} levels, {vals[0]} … {vals[-1]}" + except Exception: + pass + + return { + "time_name": time_name, + "lat_name": lat_name, + "lon_name": lon_name, + "level_name": level_name, 
+ "time_text": time_text, + "spatial_text": spatial_text, + "level_text": level_text, + } + + def _set_coord_selectors(self, all_names: List[str], meta: dict) -> None: + for widget in (self.nc_time_var, self.nc_lat_var, self.nc_lon_var, self.nc_level_var): + widget.options = all_names + self.nc_time_var.value = meta.get("time_name") if meta.get("time_name") in all_names else None + self.nc_lat_var.value = meta.get("lat_name") if meta.get("lat_name") in all_names else None + self.nc_lon_var.value = meta.get("lon_name") if meta.get("lon_name") in all_names else None + self.nc_level_var.value = meta.get("level_name") if meta.get("level_name") in all_names else None + + def _update_env_info(self, file_name="-", meta=None) -> None: + multilevel = self._unique_values(self.multilevel_continuous_vars.value, self.multilevel_categorical_vars.value) + surface = self._unique_values(self.surface_continuous_vars.value, self.surface_categorical_vars.value) + meta = meta or {} + self.env_info.object = ( + f"File: {file_name}
" + f"Multilevel parameters: {', '.join(multilevel) if multilevel else '-'}
" + f"Surface parameters: {', '.join(surface) if surface else '-'}
" + f"Time range: {meta.get('time_text', '-')}
" + f"Spatial range: {meta.get('spatial_text', '-')}
" + f"Vertical levels: {meta.get('level_text', '-')}
" + ) + + @staticmethod + def _unique_values(*lists): + out, seen = [], set() + for values in lists: + for v in list(values or []): + if v not in seen: + seen.add(v) + out.append(v) + return out + + def _on_load_movement(self, event=None) -> None: + path_value = self._file_value(self.movement_csv.value) + if not path_value: + self._append_log("No movement CSV selected.") + return + + path = Path(str(path_value)).expanduser() + if not path.exists() or not path.is_file(): + self._append_log(f"Movement CSV does not exist or is not a file: {path}") + return + + try: + df_sample = pd.read_csv(path, nrows=100) + full_df = pd.read_csv(path) + except Exception as exc: + self._append_log(f"Failed to read movement CSV: {exc}") + return + + columns = list(df_sample.columns) + self._movement_columns = columns + self._movement_df = full_df + + for widget in (self.id_column, self.time_column, self.lat_column, self.lon_column, self.height_column, self.heading_column): + widget.options = columns + + self.id_column.value = self._guess_column(columns, ["individual_local_identifier", "individual-local-identifier", "id"]) + self.time_column.value = self._guess_column(columns, ["timestamp", "eobs_start_timestamp", "time", "datetime", "date"]) + self.lat_column.value = self._guess_column(columns, ["location_lat", "location-lat", "lat", "latitude"]) + self.lon_column.value = self._guess_column(columns, ["location_lon", "location_long", "location-long", "lon", "longitude"]) + self.height_column.value = self._guess_column(columns, ["height-above-ellipsoid", "height_above_ellipsoid", "height", "altitude", "elevation", "height_above_msl"]) + self.heading_column.value = self._guess_column(columns, ["heading", "bearing", "direction"]) + + self._populate_taxa_ids_and_info(path, full_df) + self._append_log(f"Movement CSV loaded: {len(columns)} column(s) detected.") + self._update_dynamic_states() + self._refresh_preview() + + def _populate_taxa_ids_and_info(self, path: Path, df: pd.DataFrame) -> 
None: + id_col = self.id_column.value + taxon_col = self._guess_column(list(df.columns), ["individual-taxon-canonical-name", "individual_taxon_canonical_name", "taxon", "species"]) + + ids = sorted(df[id_col].dropna().astype(str).unique()) if id_col and id_col in df.columns else [] + taxa = sorted(df[taxon_col].dropna().astype(str).unique()) if taxon_col and taxon_col in df.columns else [] + + self.id_multiselect.options = ids + self.id_multiselect.value = ids + self.taxon_multiselect.options = taxa + self.taxon_multiselect.value = [] + + time_text = "-" + if self.time_column.value and self.time_column.value in df.columns: + ts = pd.to_datetime(df[self.time_column.value], errors="coerce", dayfirst=True) + if ts.notna().any(): + time_text = f"{ts.min():%Y-%m-%d %H:%M:%S} — {ts.max():%Y-%m-%d %H:%M:%S}" + + spatial_text = "-" + if self.lat_column.value in df.columns and self.lon_column.value in df.columns: + lat = pd.to_numeric(df[self.lat_column.value], errors="coerce") + lon = pd.to_numeric(df[self.lon_column.value], errors="coerce") + if lat.notna().any() and lon.notna().any(): + spatial_text = f"lat[{float(lat.min()):.3f}..{float(lat.max()):.3f}], lon[{float(lon.min()):.3f}..{float(lon.max()):.3f}]" + + self.movement_info.object = ( + f"File: {path.name}
Taxons: {len(taxa)}
IDs: {len(ids)}
" + f"Time range: {time_text}
Spatial range: {spatial_text}
" + ) + + def _update_ids_by_taxon(self, event=None) -> None: + if self._movement_df is None: + return + df = self._movement_df + id_col = self.id_column.value + if not id_col or id_col not in df.columns: + return + taxon_col = self._guess_column(list(df.columns), ["individual-taxon-canonical-name", "individual_taxon_canonical_name", "taxon", "species"]) + selected_taxa = list(self.taxon_multiselect.value or []) + if selected_taxa and taxon_col and taxon_col in df.columns: + ids = sorted(df.loc[df[taxon_col].astype(str).isin(selected_taxa), id_col].dropna().astype(str).unique()) + else: + ids = sorted(df[id_col].dropna().astype(str).unique()) + self.id_multiselect.options = ids + self.id_multiselect.value = ids + self._refresh_preview() + + def _on_scan_geopotential(self, event=None) -> None: + try: + vars_, all_names, meta = self._read_nc_metadata(self.geopotential_file.value) + except Exception as exc: + self._append_log(f"Failed to scan geopotential file: {exc}") + return + self.geopotential_variable.options = vars_ + self.geopotential_variable.value = vars_[0] if vars_ else None + self._set_coord_selectors(all_names, meta) + self._append_log(f"Scanned geopotential file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_multilevel(self, event=None) -> None: + try: + vars_, all_names, meta = self._read_nc_metadata(self.multilevel_var_file.value) + except Exception as exc: + self._append_log(f"Failed to scan multilevel file: {exc}") + return + self.multilevel_variable.options = vars_ + self.multilevel_variable.value = vars_[0] if vars_ else None + self.multilevel_continuous_vars.options = vars_ + self.multilevel_categorical_vars.options = vars_ + self.multilevel_continuous_vars.value = vars_ + self.multilevel_categorical_vars.value = [] + if not self.nc_time_var.options: + self._set_coord_selectors(all_names, meta) + self._update_env_info(Path(str(self._file_value(self.multilevel_var_file.value))).name, meta) + self._append_log(f"Scanned 
multilevel file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_surface(self, event=None) -> None: + try: + vars_, _, meta = self._read_nc_metadata(self.surface_var_file.value) + except Exception as exc: + self._append_log(f"Failed to scan surface file: {exc}") + return + self.surface_variable.options = vars_ + self.surface_variable.value = vars_[0] if vars_ else None + self.surface_continuous_vars.options = vars_ + self.surface_categorical_vars.options = vars_ + self.surface_continuous_vars.value = vars_ + self.surface_categorical_vars.value = [] + self._update_env_info(Path(str(self._file_value(self.surface_var_file.value))).name, meta) + self._append_log(f"Scanned surface file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _scan_nc_to_select(self, path_widget, select_widget, label: str) -> None: + if not self._path_exists(path_widget.value): + return + try: + vars_, _, _ = self._read_nc_metadata(path_widget.value) + except Exception as exc: + self._append_log(f"Failed to scan {label} file: {exc}") + return + select_widget.options = vars_ + select_widget.value = vars_[0] if vars_ else None + self._append_log(f"Scanned {label} file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_optional_components(self, event=None) -> None: + for path_widget, select_widget, label in [ + (self.u_file, self.u_variable, "U component"), + (self.v_file, self.v_variable, "V component"), + (self.w_file, self.w_variable, "W component"), + (self.temperature_file, self.temperature_variable, "temperature"), + ]: + if self._path_exists(path_widget.value): + self._scan_nc_to_select(path_widget, select_widget, label) + self._update_dynamic_states() + + def _on_load_boundary(self, event=None) -> None: + path = self._optional_path(self.boundary_file.value) + if not path: + self._append_log("No boundary file selected.") + return + try: + if load_vector_extent_info is not None: + loaded_path, south, north, west, east 
= load_vector_extent_info(path) + else: + if gpd is None: + raise RuntimeError("geopandas is not available.") + gdf = gpd.read_file(path) + west, south, east, north = gdf.total_bounds + loaded_path = path + self.boundary_path = str(loaded_path) + self.boundary_info.object = ( + f"Boundary file: {Path(loaded_path).name}
" + f"Spatial range: lat[{south:.3f}..{north:.3f}], lon[{west:.3f}..{east:.3f}]" + ) + self._append_log(f"Boundary loaded: {Path(loaded_path).name}") + except Exception as exc: + self._append_log(f"Failed to load boundary: {exc}") + + def _on_reset_boundary(self, event=None) -> None: + self.boundary_path = None + self.boundary_info.object = "Boundary file: not selected
Spatial range: = environmental data boundary" + self._append_log("Boundary reset to environmental data boundary.") + + def _update_smoothing_options(self, event=None) -> None: + if self.spatial_interpolation_method.value == "Nearest neighbor": + self.control_smoothing.options = ["1"] + self.control_smoothing.value = "1" + else: + self.control_smoothing.options = ["2", "4", "6", "8"] + if self.control_smoothing.value not in self.control_smoothing.options: + self.control_smoothing.value = "4" + + def _update_dynamic_states(self, *_events) -> None: + has_u = self._path_exists(self.u_file.value) + has_v = self._path_exists(self.v_file.value) + has_w = self._path_exists(self.w_file.value) + has_t = self._path_exists(self.temperature_file.value) + has_dem = self.topography_source.value != "None" and self._path_exists(self.dem_file.value) + + self.dem_units.disabled = self.topography_source.value == "None" + self.dem_reference.disabled = self.topography_source.value == "None" + + wind_ready = has_u and has_v + self.derive_wind_speed_direction.disabled = not wind_ready + self.derive_wind_support_crosswind.disabled = not wind_ready + self.track_direction_source.disabled = not wind_ready + self.heading_column.disabled = not (wind_ready and self.track_direction_source.value == "Use existing heading column") + self.derive_vertical_motion.disabled = not has_w + self.derive_thermal_uplift.disabled = not has_t + self.derive_orographic_uplift.disabled = not (wind_ready and has_dem) + + if not wind_ready: + self.derive_wind_speed_direction.value = False + self.derive_wind_support_crosswind.value = False + if not has_w: + self.derive_vertical_motion.value = False + if not has_t: + self.derive_thermal_uplift.value = False + if not (wind_ready and has_dem): + self.derive_orographic_uplift.value = False + + def _enforce_split_unique(self, first, second, changed: str, new_values: list): + a = list(first.value or []) + b = list(second.value or []) + if changed == "first": + overlap = 
set(new_values) & set(b) + if overlap: + second.value = [v for v in b if v not in overlap] + else: + overlap = set(new_values) & set(a) + if overlap: + first.value = [v for v in a if v not in overlap] + self._update_env_info() + self._refresh_preview() + + def _wire_variable_split_guards(self): + self.multilevel_continuous_vars.param.watch( + lambda e: self._enforce_split_unique(self.multilevel_continuous_vars, self.multilevel_categorical_vars, "first", list(e.new or [])), + "value", + ) + self.multilevel_categorical_vars.param.watch( + lambda e: self._enforce_split_unique(self.multilevel_continuous_vars, self.multilevel_categorical_vars, "second", list(e.new or [])), + "value", + ) + self.surface_continuous_vars.param.watch( + lambda e: self._enforce_split_unique(self.surface_continuous_vars, self.surface_categorical_vars, "first", list(e.new or [])), + "value", + ) + self.surface_categorical_vars.param.watch( + lambda e: self._enforce_split_unique(self.surface_continuous_vars, self.surface_categorical_vars, "second", list(e.new or [])), + "value", + ) + + def _selected_multilevel_vars(self) -> List[str]: + return self._unique_values(self.multilevel_continuous_vars.value, self.multilevel_categorical_vars.value) + + def _selected_surface_vars(self) -> List[str]: + return self._unique_values(self.surface_continuous_vars.value, self.surface_categorical_vars.value) + + def _first_or_none(self, values): + values = list(values or []) + return values[0] if values else None + + def _refresh_preview(self) -> None: + lines = [ + "### Preview", + f"- **Movement CSV:** `{self.movement_csv.value or '-'}`", + f"- **Selected IDs:** `{len(self.id_multiselect.value or [])}`", + f"- **Height column:** `{self.height_column.value or '-'}`", + f"- **Geopotential file:** `{self.geopotential_file.value or '-'}`", + f"- **Geopotential variable:** `{self.geopotential_variable.value or '-'}`", + f"- **Multilevel file:** `{self.multilevel_var_file.value or '-'}`", + f"- **Continuous 
multilevel variables:** `{list(self.multilevel_continuous_vars.value or [])}`", + f"- **Categorical multilevel variables:** `{list(self.multilevel_categorical_vars.value or [])}`", + f"- **Surface file:** `{self.surface_var_file.value or '-'}`", + f"- **Continuous surface variables:** `{list(self.surface_continuous_vars.value or [])}`", + f"- **Categorical surface variables:** `{list(self.surface_categorical_vars.value or [])}`", + f"- **Spatial method:** `{self.spatial_interpolation_method.value}`", + f"- **Nearest grid points:** `{self.control_smoothing.value}`", + f"- **Vertical method:** `{self.vertical_matching_method.value}`", + f"- **Boundary file:** `{self.boundary_path or '-'}`", + f"- **Topography source:** `{self.topography_source.value}`", + "", + "**Derived metrics enabled:**", + f"- Wind speed/direction: `{self.derive_wind_speed_direction.value}`", + f"- Wind support/cross wind: `{self.derive_wind_support_crosswind.value}`", + f"- Vertical motion: `{self.derive_vertical_motion.value}`", + f"- Thermal proxy: `{self.derive_thermal_uplift.value}`", + f"- Orographic uplift: `{self.derive_orographic_uplift.value}`", + ] + self.preview.object = "\n".join(lines) + + def _backend_height_reference(self) -> str: + if self.height_reference.value == "WGS84 ellipsoidal height (Movebank GPS height)": + return "ellipsoidal" + if self.height_reference.value == "Height above ground level (requires DEM)": + return "agl" + return "already_orthometric" + + def _backend_heading_source(self) -> str: + return "column" if self.track_direction_source.value == "Use existing heading column" else "compute" + + def _on_validate(self, event=None) -> None: + errors, warnings = [], [] + + if not self._path_exists(self.movement_csv.value): + errors.append("Movement CSV is missing or does not exist.") + for name, widget in [ + ("ID column", self.id_column), + ("Timestamp column", self.time_column), + ("Latitude column", self.lat_column), + ("Longitude column", self.lon_column), + 
("Height column", self.height_column), + ]: + if not widget.value: + errors.append(f"{name} is not selected.") + + if not self._path_exists(self.geopotential_file.value): + errors.append("Geopotential file is required and does not exist.") + if not self.geopotential_variable.value: + errors.append("Geopotential variable is not selected.") + if not self._path_exists(self.multilevel_var_file.value): + errors.append("Annotated multilevel variable file is required and does not exist.") + if not self._selected_multilevel_vars(): + errors.append("No multilevel annotation variables selected.") + + if self.surface_var_file.value and self._path_exists(self.surface_var_file.value) and not self._selected_surface_vars(): + errors.append("Surface variable file is set but no surface variable is selected.") + if self.surface_var_file.value and not self._path_exists(self.surface_var_file.value): + warnings.append("Surface variable file is not selected or is not a file; surface variables will be ignored.") + + if not self.id_multiselect.value: + warnings.append("No individual IDs selected; backend currently processes all rows unless ID filtering is implemented.") + if self.spatial_interpolation_method.value == "Inverse Distance Weighting" and int(self.control_smoothing.value) < 2: + errors.append("IDW requires at least 2 nearest grid points.") + if self.use_surface_as_lower_anchor.value and not self.surface_continuous_vars.value: + warnings.append("Surface anchor is enabled, but no continuous surface variable is selected.") + + if self.topography_source.value != "None" and not self._path_exists(self.dem_file.value): + errors.append("Topography source is selected, but DEM file is missing or does not exist.") + if self.height_reference.value == "Height above ground level (requires DEM)" and not self._path_exists(self.dem_file.value): + errors.append("Height reference is AGL, but DEM file is missing or does not exist.") + if self.height_reference.value == "Already orthometric / 
MSL-like": + warnings.append("Movement height is assumed to be already comparable to ERA5 geopotential height. No geoid correction will be applied.") + if self.height_reference.value == "WGS84 ellipsoidal height (Movebank GPS height)": + warnings.append("Movement height will be converted to MSL/orthometric height using selected geoid correction mode.") + if self.geopotential_units.value == "m2 s-2" and not self.convert_geopotential_to_height.value: + warnings.append("Geopotential units are m2 s-2, but conversion to height is disabled.") + if self.derive_wind_support_crosswind.value and self.track_direction_source.value == "Use existing heading column" and not self.heading_column.value: + errors.append("Heading column is required when using existing heading column.") + + if errors: + lines = ["### Validation", "**Status:** Issues found", "", *[f"- {e}" for e in errors]] + if warnings: + lines += ["", "**Warnings:**", *[f"- {w}" for w in warnings]] + self.validation.object = "\n".join(lines) + self._append_log(f"Validation completed with {len(errors)} error(s).") + else: + lines = ["### Validation", "**Status:** OK", "", "- UI configuration is sufficient for backend run."] + if warnings: + lines += ["", "**Warnings:**", *[f"- {w}" for w in warnings]] + self.validation.object = "\n".join(lines) + self._append_log("Validation completed successfully.") + self._refresh_preview() + + def _on_run(self, event=None) -> None: + self._on_validate() + if "Issues found" in str(self.validation.object): + self._append_log("Run cancelled because validation found errors.") + return + if run_multidimensional_annotation_from_paths is None: + self._append_log(f"Backend import failed: {BACKEND_IMPORT_ERROR}") + return + + self.run_button.disabled = True + self.run_button.name = "Running multidimensional annotation..." 
+ try: + output_csv = Path(str(self.output_csv.value)).expanduser() + output_csv.parent.mkdir(parents=True, exist_ok=True) + + multilevel_cont = list(self.multilevel_continuous_vars.value or []) + multilevel_cat = list(self.multilevel_categorical_vars.value or []) + surface_cont = list(self.surface_continuous_vars.value or []) + surface_cat = list(self.surface_categorical_vars.value or []) + surface_file = ( + self._optional_path(self.surface_var_file.value) + if (surface_cont or surface_cat) + else None + ) + + self._append_log("Starting multidimensional annotation.") + + result = run_multidimensional_annotation_from_paths( + movement_csv=self._optional_path(self.movement_csv.value), + output_csv=str(output_csv), + id_col=self.id_column.value, + selected_ids=list(self.id_multiselect.value) if self.id_multiselect.value else None, + time_col=self.time_column.value, + lat_col=self.lat_column.value, + lon_col=self.lon_column.value, + boundary_path=self.boundary_path or None, + height_col=self.height_column.value, + geopotential_file=self._optional_path(self.geopotential_file.value), + geopotential_variable=self.geopotential_variable.value, + geopotential_units=self.geopotential_units.value, + convert_geopotential_to_height=bool(self.convert_geopotential_to_height.value), + nc_time_var=self.nc_time_var.value or None, + nc_lat_var=self.nc_lat_var.value or None, + nc_lon_var=self.nc_lon_var.value or None, + nc_level_var=self.nc_level_var.value or None, + multilevel_var_file=self._optional_path(self.multilevel_var_file.value), + surface_var_file=surface_file, + multilevel_continuous_vars=multilevel_cont, + multilevel_categorical_vars=multilevel_cat, + surface_continuous_vars=surface_cont, + surface_categorical_vars=surface_cat, + dem_file=self._optional_path(self.dem_file.value), + save_per_individual=bool(self.save_per_individual.value), + keep_diagnostics=bool(self.keep_diagnostics.value), + vertical_matching_method=self.vertical_matching_method.value, + 
height_reference=self._backend_height_reference(), + geoid_mode=self.geoid_mode.value, + constant_geoid_undulation_m=float(self.constant_geoid_undulation_m.value or 0.0), + u_file=self._optional_path(self.u_file.value), + u_variable=self.u_variable.value if self._path_exists(self.u_file.value) else None, + v_file=self._optional_path(self.v_file.value), + v_variable=self.v_variable.value if self._path_exists(self.v_file.value) else None, + w_file=self._optional_path(self.w_file.value), + w_variable=self.w_variable.value if self._path_exists(self.w_file.value) else None, + temperature_file=self._optional_path(self.temperature_file.value), + temperature_variable=self.temperature_variable.value if self._path_exists(self.temperature_file.value) else None, + derive_wind_speed_direction=bool(self.derive_wind_speed_direction.value), + derive_wind_support_crosswind=bool(self.derive_wind_support_crosswind.value), + derive_vertical_motion=bool(self.derive_vertical_motion.value), + derive_thermal_proxy=bool(self.derive_thermal_uplift.value), + smoothing_k=int(self.control_smoothing.value), + derive_orographic_uplift=bool(self.derive_orographic_uplift.value), + heading_col=self.heading_column.value, + heading_source=self._backend_heading_source(), + ) + + n_rows = len(result) if result is not None else 0 + n_cols = len(result.columns) if result is not None else 0 + self._append_log(f"Annotation completed: {n_rows} row(s), {n_cols} column(s).") + self._append_log(f"Output saved to: {output_csv}") + preview_cols = list(result.columns[:20]) if result is not None else [] + self.preview.object = "\n".join([ + "### Preview", + f"- **Output CSV:** `{output_csv}`", + f"- **Rows:** `{n_rows}`", + f"- **Columns:** `{n_cols}`", + "", + "**First output columns:**", + *[f"- `{col}`" for col in preview_cols], + ]) + except Exception as exc: + logger.exception("Multidimensional annotation failed.") + self._append_log(f"Annotation failed: {exc}") + finally: + self.run_button.disabled = False + 
self.run_button.name = "Run multidimensional annotation" + + def _card(self, title: str, *items): + return pn.Card( + pn.Column(*items, sizing_mode="stretch_width"), + title=title, + collapsible=True, + collapsed=False, + sizing_mode="stretch_width", + margin=0, + styles={"margin": "0px", "border-radius": "0px"}, + ) + + def view(self): + COL_H = 3500 + col1 = pn.Column( + self._card( + "1. Movement data", + self.movement_csv, + self.load_movement_button, + self.taxon_multiselect, + self.id_multiselect, + self.id_column, + self.time_column, + self.lat_column, + self.lon_column, + self.height_column, + self.height_units, + self.height_reference, + self.geoid_mode, + self.constant_geoid_undulation_m, + self.movement_info, + ), + self._card( + "2. Vertical reference / geopotential", + self.geopotential_file, + self.scan_geopotential_button, + self.geopotential_variable, + self.nc_time_var, + self.nc_lat_var, + self.nc_lon_var, + self.nc_level_var, + self.geopotential_units, + self.convert_geopotential_to_height, + self.gravity_constant, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + col2 = pn.Column( + self._card( + "3. Annotation variables", + self.multilevel_var_file, + self.scan_multilevel_button, + self.multilevel_continuous_vars, + self.multilevel_categorical_vars, + self.surface_var_file, + self.scan_surface_button, + self.surface_continuous_vars, + self.surface_categorical_vars, + self.env_info, + ), + self._card( + "4. Optional atmospheric components", + self.u_file, + self.u_variable, + self.v_file, + self.v_variable, + self.w_file, + self.w_variable, + self.temperature_file, + self.temperature_variable, + self.scan_optional_components_button, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + col3 = pn.Column( + self._card("5. 
Boundary data", self.boundary_file, pn.Row(self.load_boundary_button, self.reset_boundary_button), self.boundary_info), + self._card( + "6. Interpolation / vertical matching", + self.spatial_interpolation_method, + self.control_smoothing, + self.vertical_matching_method, + self.use_surface_as_lower_anchor, + self.surface_anchor_height_agl_m, + ), + self._card("7. Topography", self.topography_source, self.dem_file, self.dem_units, self.dem_reference), + self._card( + "8. Derived metrics", + self.derive_wind_speed_direction, + self.derive_wind_support_crosswind, + self.track_direction_source, + self.heading_column, + self.derive_vertical_motion, + self.derive_thermal_uplift, + self.derive_orographic_uplift, + ), + self._card( + "9. Output", + self.output_csv, + self.save_per_individual, + self.keep_diagnostics, + self.validate_button, + self.run_button, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + return pn.Column( + "# Multidimensional Annotation Engine App (DEMO)", + pn.GridBox(col1, col2, col3, ncols=3, sizing_mode="stretch_width", height=COL_H, scroll=True), + pn.Row(self.preview, self.validation, self.log, sizing_mode="stretch_width"), + sizing_mode="stretch_width", + ) + + +@register_view(ext_args=["floatpanel"]) +def view(): + app = Multidimensional_Annotation_App() + template = DEFAULT_TEMPLATE(main=[app.view()], sidebar=[]) + return template + + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/app/apps/nc_builder_app.py b/ecodata/app/apps/nc_builder_app.py new file mode 100644 index 0000000..398dc81 --- /dev/null +++ b/ecodata/app/apps/nc_builder_app.py @@ -0,0 +1,855 @@ +import logging +from pathlib import Path +from typing import Dict, List, Optional + +import panel as pn +import pandas as pd + +from ecodata.app.config import DEFAULT_TEMPLATE +from ecodata.app.models import 
FileSelector +from ecodata.panel_utils import register_view + +logger = logging.getLogger(__file__) + +BACKEND_IMPORT_ERROR = None + +try: + from ecodata.nc_builder_functions import ( + NCBuildConfig, + build_standardized_netcdf, + scan_netcdf_files, + validate_build_config, + ) +except Exception as exc: + BACKEND_IMPORT_ERROR = str(exc) + NCBuildConfig = None + build_standardized_netcdf = None + scan_netcdf_files = None + validate_build_config = None + + +class NCBuilder_App: + """ + UI for building a standardized CF-style NetCDF file from multiple ERA5 or generic NetCDF files. + """ + + def __init__(self): + self.name = "NetCDF Builder" + self._scanned_files: List[Path] = [] + self._detected_time_min: Optional[pd.Timestamp] = None + self._detected_time_max: Optional[pd.Timestamp] = None + + # 1. Input files + self.input_folder = FileSelector( + name="Input folder", + constrain_path=False, + expanded=True, + size=10, + ) + + self.input_files = pn.widgets.MultiSelect( + name="Select files from current folder", + options={}, + value=[], + size=12, + sizing_mode="stretch_width", + ) + + self.combine_mode = pn.widgets.RadioButtonGroup( + name="Combine mode", + options=["By time", "By level", "By time and level"], + value="By time and level", + button_type="primary", + sizing_mode="stretch_width", + ) + + # 2. 
Variable and coordinate mapping + self.target_variable = pn.widgets.MultiSelect( + name="Target variable(s)", + options=[], + value=[], + size=8, + sizing_mode="stretch_width", + ) + self.time_variable = pn.widgets.Select(name="Time variable", options=[], value=None, sizing_mode="stretch_width") + self.lat_variable = pn.widgets.Select(name="Latitude variable", options=[], value=None, sizing_mode="stretch_width") + self.lon_variable = pn.widgets.Select(name="Longitude variable", options=[], value=None, sizing_mode="stretch_width") + self.level_variable = pn.widgets.Select(name="Vertical / level variable", options=["None"], value="None", sizing_mode="stretch_width") + + self.output_variable_name = pn.widgets.TextInput( + name="Output variable name", + placeholder="Example: temperature", + value="", + sizing_mode="stretch_width", + ) + self.output_level_coord_name = pn.widgets.TextInput( + name="Output level coordinate name", + value="level", + sizing_mode="stretch_width", + ) + self.level_units = pn.widgets.Select( + name="Level units", + options=["hPa", "m", "Pa", "model_level", "custom"], + value="hPa", + sizing_mode="stretch_width", + ) + self.level_units_custom = pn.widgets.TextInput( + name="Custom level units", + placeholder="Example: sigma, hybrid_level, depth_m", + value="", + disabled=True, + sizing_mode="stretch_width", + ) + + self.cf_note = pn.pane.Markdown( + ( + "**Standard output coordinate names:** `time`, `lat`, `lon`, `level` \n" + "The backend writes basic CF-style metadata for coordinate attributes." + ), + sizing_mode="stretch_width", + ) + + # 3. 
Level detection + self.level_source = pn.widgets.Select( + name="Level source", + options=["From NetCDF coordinate", "From filename", "Manual table"], + value="From NetCDF coordinate", + sizing_mode="stretch_width", + ) + self.level_regex = pn.widgets.TextInput( + name="Level regex", + value=r"level(\d+)", + placeholder=r"Example: level(\d+)", + sizing_mode="stretch_width", + ) + self.level_table_path = pn.widgets.TextInput( + name="Level table file", + placeholder="CSV with columns: name, level", + value="", + sizing_mode="stretch_width", + ) + self.level_table_note = pn.pane.Markdown( + ( + "**Manual level table format:** CSV with columns `name` and `level`. \n" + "`name` should match the input file name or a unique part of it." + ), + sizing_mode="stretch_width", + ) + + # 4. Time detection + self.time_source = pn.widgets.Select( + name="Time source", + options=["From NetCDF time coordinate", "From filename", "Manual table"], + value="From NetCDF time coordinate", + sizing_mode="stretch_width", + ) + self.time_regex = pn.widgets.TextInput( + name="Time regex", + value=r"(\d{8})", + placeholder=r"Example: (\d{8}) for YYYYMMDD", + sizing_mode="stretch_width", + ) + self.time_format = pn.widgets.TextInput( + name="Time format", + value="%Y%m%d", + placeholder="Example: %Y%m%d or %Y-%m-%d_%H", + sizing_mode="stretch_width", + ) + self.time_table_path = pn.widgets.TextInput( + name="Time table file", + placeholder="CSV with columns: name, DateTime", + value="", + sizing_mode="stretch_width", + ) + self.time_table_note = pn.pane.Markdown( + ( + "**Manual time table format:** CSV with columns `name` and `DateTime`. \n" + "`name` should match the input file name or a unique part of it. \n" + "`DateTime` should be parseable by pandas, e.g. `1994-01-01 00:00:00`." + ), + sizing_mode="stretch_width", + ) + + # 5. 
Spatial subset + self.use_bbox = pn.widgets.Checkbox(name="Bounding box", value=False, sizing_mode="stretch_width") + self.bbox_south = pn.widgets.FloatInput(name="South", value=None, step=0.25) + self.bbox_north = pn.widgets.FloatInput(name="North", value=None, step=0.25) + self.bbox_west = pn.widgets.FloatInput(name="West", value=None, step=0.25) + self.bbox_east = pn.widgets.FloatInput(name="East", value=None, step=0.25) + self.bbox_note = pn.pane.Markdown( + "If the bounding box is not enabled, the original spatial extent is preserved.", + sizing_mode="stretch_width", + ) + + # 6. Time subset + self.detected_time_range = pn.pane.Markdown("**Detected time range:** not scanned yet", sizing_mode="stretch_width") + self.start_time = pn.widgets.DatetimePicker(name="Start time", value=None, sizing_mode="stretch_width") + self.end_time = pn.widgets.DatetimePicker(name="End time", value=None, sizing_mode="stretch_width") + self.time_subset_note = pn.pane.Markdown( + ( + "If input files do not contain a time coordinate, use **Time source = From filename** " + "or **Manual table**. If no time information is provided, all files will be used." + ), + sizing_mode="stretch_width", + ) + + # 7. 
Output settings + self.output_folder = pn.widgets.TextInput( + name="Output folder", + placeholder="Path to output folder", + value=str(Path.home() / "Downloads"), + sizing_mode="stretch_width", + ) + self.output_filename = pn.widgets.TextInput( + name="Output filename", + value="era5_standardized_temperature.nc", + sizing_mode="stretch_width", + ) + self.output_mode = pn.widgets.Select( + name="Output mode", + options=["Single NetCDF file"], + value="Single NetCDF file", + sizing_mode="stretch_width", + ) + self.use_dask_chunks = pn.widgets.Checkbox(name="Use chunking when reading", value=False, sizing_mode="stretch_width") + self.chunking_mode = pn.widgets.Select(name="Chunking mode", options=["auto", "manual"], value="auto", sizing_mode="stretch_width") + self.chunk_time = pn.widgets.IntInput(name="time chunk", value=24, start=1, step=1, disabled=True) + self.chunk_level = pn.widgets.IntInput(name="level chunk", value=1, start=1, step=1, disabled=True) + self.chunk_lat = pn.widgets.IntInput(name="lat chunk", value=200, start=1, step=10, disabled=True) + self.chunk_lon = pn.widgets.IntInput(name="lon chunk", value=200, start=1, step=10, disabled=True) + self.enable_compression = pn.widgets.Checkbox(name="Enable NetCDF compression", value=True, sizing_mode="stretch_width") + + # Preview / validation / log + self.preview = pn.pane.Markdown( + "### Preview\nNo files scanned yet.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + self.validation_panel = pn.pane.Markdown( + "### Validation\nNot validated yet.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + self.log = pn.pane.Markdown( + "### Log\nReady.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + + # Buttons + self.load_files_button = pn.widgets.Button( + name="Load file list", + button_type="primary", 
+ sizing_mode="stretch_width", + ) + + self.scan_variables_button = pn.widgets.Button( + name="Scan variables", + button_type="primary", + sizing_mode="stretch_width", + ) + self.validate_button = pn.widgets.Button( + name="Validate", + button_type="primary", + sizing_mode="stretch_width", + ) + + self.build_button = pn.widgets.Button( + name="Build standardized NetCDF", + button_type="primary", + sizing_mode="stretch_width", + ) + + self.load_files_button.on_click(self._on_load_file_list) + self.scan_variables_button.on_click(self._on_scan_variables) + self.validate_button.on_click(self._on_validate) + self.build_button.on_click(self._on_build) + self.target_variable.param.watch(self._on_target_variables_changed, "value") + self.level_units.param.watch(self._update_widget_states, "value") + self.level_source.param.watch(self._update_widget_states, "value") + self.time_source.param.watch(self._update_widget_states, "value") + self.use_bbox.param.watch(self._update_widget_states, "value") + self.chunking_mode.param.watch(self._update_widget_states, "value") + self.use_dask_chunks.param.watch(self._update_widget_states, "value") + self.combine_mode.param.watch(self._update_widget_states, "value") + self._update_widget_states() + + def _append_log(self, message: str) -> None: + old = self.log.object or "### Log\n" + if old.strip() == "### Log\nReady.": + old = "### Log\n" + self.log.object = old + f"\n- {message}" + + def _current_input_directory(self) -> Optional[Path]: + """ + Return the input folder represented by the custom FileSelector. + + The custom selector is used only to define the folder. + If the selector value is a file, NCBuilder uses its parent folder. + The actual file list for scan/validate/build is controlled by self.input_files. 
+ """ + candidates = [ + getattr(self.input_folder, "value", None), + getattr(self.input_folder, "directory", None), + ] + + for raw_value in candidates: + if not raw_value: + continue + + if isinstance(raw_value, (list, tuple)): + if not raw_value: + continue + raw_value = raw_value[0] + + path = Path(str(raw_value)).expanduser() + + if path.exists() and path.is_file(): + return path.parent + + if path.exists() and path.is_dir(): + return path + + return None + + + def _list_netcdf_files_in_selected_folder(self) -> List[Path]: + """ + List supported NetCDF-like files in the current input folder. + """ + folder = self._current_input_directory() + if folder is None: + return [] + + extensions = {".nc", ".nc4", ".cdf", ".netcdf"} + + files = [ + p for p in folder.iterdir() + if p.is_file() and p.suffix.lower() in extensions + ] + + return sorted(files, key=lambda p: p.name.lower()) + + + def _refresh_input_file_options(self) -> None: + """ + Load all supported NetCDF files from the current FileSelector directory + into the MultiSelect. + + This method controls what files are visible in the UI. + It does not decide what files will be passed to the backend. + """ + files = self._list_netcdf_files_in_selected_folder() + + options = { + f.name: str(f) + for f in files + if f.exists() and f.is_file() + } + + self.input_files.options = options + + # When a new folder is opened, select all detected files by default. + # The user can then deselect files manually. + self.input_files.value = list(options.values()) + + + def _on_load_file_list(self, event=None) -> None: + """ + Load all supported NetCDF files from the current custom FileSelector folder + into the MultiSelect. + + The custom FileSelector is used only to define the folder. + The actual files passed to scan/validate/build are controlled by + self.input_files.value. 
+ """ + self.log.object = "### Log\n" + + folder = self._current_input_directory() + + if folder is None: + selector_value = getattr(self.input_folder, "value", None) + selector_directory = getattr(self.input_folder, "directory", None) + + self.input_files.options = {} + self.input_files.value = [] + + self.preview.object = ( + "### Preview\n" + "No valid input folder was detected from the custom selector.\n\n" + f"- `FileSelector.value`: `{selector_value}`\n" + f"- `FileSelector.directory`: `{selector_directory}`\n\n" + "Open the target folder or click any file inside that folder, then press **Load file list**." + ) + self._append_log("No valid input folder detected from FileSelector.") + return + + self._refresh_input_file_options() + + n_files = len(self.input_files.options or {}) + + self.preview.object = ( + "### Preview\n" + f"- **Input folder:** `{folder}`\n" + f"- **Files loaded into Select files from current folder:** {n_files}\n" + "- Deselect files that should not be scanned or built." + ) + + if n_files == 0: + self._append_log( + f"No supported NetCDF files found in `{folder}`. " + "Expected extensions: .nc, .nc4, .cdf, .netcdf." + ) + else: + self._append_log(f"Loaded {n_files} NetCDF file(s) from `{folder}`.") + + def _collect_input_files(self) -> List[Path]: + """ + Collect only files explicitly selected in the MultiSelect. + + MultiSelect options may contain all files from the folder, + but only MultiSelect value is passed to scan/validate/build. 
+ """ + selected_values = list(self.input_files.value or []) + + files: List[Path] = [] + + for value in selected_values: + path = Path(str(value)).expanduser() + if path.exists() and path.is_file(): + files.append(path) + + unique_files: List[Path] = [] + seen = set() + + for f in files: + key = str(f.resolve()) if f.exists() else str(f) + if key not in seen: + seen.add(key) + unique_files.append(f) + + return unique_files + + def _sync_selected_files(self) -> List[Path]: + """ + Synchronize backend file list with the current MultiSelect selection. + """ + files = self._collect_input_files() + self._scanned_files = [ + Path(f).expanduser() + for f in files + if Path(f).expanduser().exists() + ] + return self._scanned_files + + def _on_target_variables_changed(self, event=None) -> None: + """ + Update output-name behaviour depending on single-variable or multi-variable mode. + + In multi-variable mode, source variable names are preserved, so the single + output variable name field is disabled. 
+ """ + selected_targets = list(self.target_variable.value or []) + + if len(selected_targets) == 1: + self.output_variable_name.disabled = False + if not self.output_variable_name.value: + self.output_variable_name.value = selected_targets[0] + elif len(selected_targets) > 1: + self.output_variable_name.value = "" + self.output_variable_name.disabled = True + else: + self.output_variable_name.disabled = False + + def _manual_chunks_dict(self) -> Dict[str, int]: + return { + "time": int(self.chunk_time.value), + "level": int(self.chunk_level.value), + "lat": int(self.chunk_lat.value), + "lon": int(self.chunk_lon.value), + } + + def _update_widget_states(self, *_events) -> None: + self.level_units_custom.disabled = self.level_units.value != "custom" + + self.level_variable.disabled = self.level_source.value != "From NetCDF coordinate" + self.level_regex.disabled = self.level_source.value != "From filename" + self.level_table_path.disabled = self.level_source.value != "Manual table" + + self.time_variable.disabled = self.time_source.value != "From NetCDF time coordinate" + self.time_regex.disabled = self.time_source.value != "From filename" + self.time_format.disabled = self.time_source.value != "From filename" + self.time_table_path.disabled = self.time_source.value != "Manual table" + + bbox_disabled = not self.use_bbox.value + for widget in (self.bbox_south, self.bbox_north, self.bbox_west, self.bbox_east): + widget.disabled = bbox_disabled + + manual_chunks = self.use_dask_chunks.value and self.chunking_mode.value == "manual" + self.chunking_mode.disabled = not self.use_dask_chunks.value + for widget in (self.chunk_time, self.chunk_level, self.chunk_lat, self.chunk_lon): + widget.disabled = not manual_chunks + # In "By time" mode, the selected files already define the time range. + # Avoid applying an additional pandas-based time subset, especially for + # cftime calendars such as Julian/noleap/360_day. 
+ time_subset_disabled = self.combine_mode.value == "By time" + + self.start_time.disabled = time_subset_disabled + self.end_time.disabled = time_subset_disabled + + if time_subset_disabled: + self.time_subset_note.object = ( + "In **By time** mode, time subsetting is disabled. " + "Select the required files in **Select files from current folder** instead. " + "The detected time range is shown for information only." + ) + else: + self.time_subset_note.object = ( + "If input files do not contain a time coordinate, use **Time source = From filename** " + "or **Manual table**. If no time information is provided, all files will be used." + ) + def _make_bbox_config(self) -> Optional[Dict[str, float]]: + if not self.use_bbox.value: + return None + return { + "south": float(self.bbox_south.value), + "north": float(self.bbox_north.value), + "west": float(self.bbox_west.value), + "east": float(self.bbox_east.value), + } + + def _make_output_path(self) -> str: + folder = Path(self.output_folder.value or ".").expanduser() + filename = self.output_filename.value or "standardized_output.nc" + return str(folder / filename) + + def _make_build_config(self): + if NCBuildConfig is None: + raise RuntimeError(f"NCBuilder backend functions are not available. 
Import error: {BACKEND_IMPORT_ERROR}") + + manual_chunks = None + if self.use_dask_chunks.value and self.chunking_mode.value == "manual": + manual_chunks = self._manual_chunks_dict() + + level_units = self.level_units_custom.value if self.level_units.value == "custom" else self.level_units.value + + level_variable = self.level_variable.value + if level_variable == "None": + level_variable = None + target_variables = list(self.target_variable.value or []) + target_variable = target_variables[0] if target_variables else None + self._sync_selected_files() + + if self.combine_mode.value == "By time": + start_time = None + end_time = None + else: + start_time = str(self.start_time.value) if self.start_time.value else None + end_time = str(self.end_time.value) if self.end_time.value else None + + return NCBuildConfig( + files=[str(p) for p in self._scanned_files], + combine_mode=self.combine_mode.value, + target_variable=target_variable, + output_variable_name=self.output_variable_name.value or target_variable, + target_variables=target_variables, + lat_variable=self.lat_variable.value, + lon_variable=self.lon_variable.value, + time_source=self.time_source.value, + time_variable=self.time_variable.value, + time_regex=self.time_regex.value, + time_format=self.time_format.value, + time_table_path=self.time_table_path.value or None, + level_source=self.level_source.value, + level_variable=level_variable, + level_regex=self.level_regex.value, + level_table_path=self.level_table_path.value or None, + output_level_coord_name=self.output_level_coord_name.value or "level", + level_units=level_units, + bbox=self._make_bbox_config(), + start_time=start_time, + end_time=end_time, + output_path=self._make_output_path(), + use_dask_chunks=bool(self.use_dask_chunks.value), + chunking_mode=self.chunking_mode.value, + manual_chunks=manual_chunks, + enable_compression=bool(self.enable_compression.value), + convert_longitude_to_180=True, + open_engine="auto", + 
use_modis_time_encoding=True, + ) + + def _on_scan_variables(self, event=None) -> None: + self.log.object = "### Log\n" + + self._sync_selected_files() + + if not self._scanned_files: + self.preview.object = ( + "### Preview\n" + "No NetCDF files are selected. First click **Load file list**, " + "then keep one or more files selected in **Select files from current folder**." + ) + self._append_log("No NetCDF files selected.") + return + + self._append_log(f"Found {len(self._scanned_files)} existing NetCDF file(s).") + + if scan_netcdf_files is None: + self.preview.object = ( + "### Preview\nBackend scan function is not available.\n\n" + f"Import error: `{BACKEND_IMPORT_ERROR}`" + ) + self._append_log("Backend scan function is not available.") + return + + try: + meta = scan_netcdf_files( + self._scanned_files, + max_scan=10, + use_dask_chunks=bool(self.use_dask_chunks.value), + chunking_mode=self.chunking_mode.value, + manual_chunks=self._manual_chunks_dict() if self.chunking_mode.value == "manual" else None, + ) + except Exception as exc: + self.preview.object = f"### Preview\nScan failed: `{exc}`" + self._append_log(f"Scan failed: {exc}") + return + + variables = meta.get("variables", []) + all_names = meta.get("all_names", []) + + self.target_variable.options = variables + self.target_variable.value = [variables[0]] if variables else [] + + self.time_variable.options = all_names + self.lat_variable.options = all_names + self.lon_variable.options = all_names + self.level_variable.options = ["None"] + all_names + + self.time_variable.value = meta.get("suggested_time") + self.lat_variable.value = meta.get("suggested_lat") + self.lon_variable.value = meta.get("suggested_lon") + suggested_level = meta.get("suggested_level") + self.level_variable.value = suggested_level if suggested_level else "None" + + if not self.time_variable.value: + self.time_source.value = "From filename" + self._append_log("No obvious time variable detected. 
Time source was set to 'From filename'.") + + selected_targets = list(self.target_variable.value or []) + if selected_targets: + first_target = selected_targets[0] + + if len(selected_targets) == 1: + self.output_variable_name.value = str(first_target) + if not self.output_filename.value or self.output_filename.value == "era5_standardized_temperature.nc": + self.output_filename.value = f"standardized_{first_target}.nc" + else: + # In multi-variable mode the backend keeps original variable names. + # The output_variable_name field is only meaningful for single-variable mode. + self.output_variable_name.value = "" + if not self.output_filename.value or self.output_filename.value == "era5_standardized_temperature.nc": + self.output_filename.value = "standardized_multivariable.nc" + + self._detected_time_min = pd.to_datetime(meta.get("time_min")) if meta.get("time_min") else None + self._detected_time_max = pd.to_datetime(meta.get("time_max")) if meta.get("time_max") else None + + if self._detected_time_min is not None and self._detected_time_max is not None: + self.start_time.value = self._detected_time_min.to_pydatetime() + self.end_time.value = self._detected_time_max.to_pydatetime() + self.detected_time_range.object = ( + f"**Detected time range:** {self._detected_time_min} → {self._detected_time_max}" + ) + else: + self.detected_time_range.object = "**Detected time range:** not detected from NetCDF coordinates" + + warnings = meta.get("warnings", []) + preview_lines = [ + "### Preview", + f"- **Candidate files:** {len(self._scanned_files)}", + f"- **Scanned files:** {meta.get('scanned_count', 0)}", + f"- **Detected variables:** {', '.join(variables) if variables else '-'}", + f"- **Detected coordinates:** {', '.join(meta.get('coords', [])) if meta.get('coords') else '-'}", + f"- **Detected dimensions:** {', '.join(meta.get('dims', [])) if meta.get('dims') else '-'}", + f"- **Combine mode:** {self.combine_mode.value}", + f"- **Target variable(s):** {', 
'.join(self.target_variable.value) if self.target_variable.value else '-'}", + f"- **Time variable:** {self.time_variable.value or '-'}", + f"- **Latitude variable:** {self.lat_variable.value or '-'}", + f"- **Longitude variable:** {self.lon_variable.value or '-'}", + f"- **Level variable:** {self.level_variable.value or 'None'}", + f"- **Time source:** {self.time_source.value}", + f"- **Level source:** {self.level_source.value}", + ] + if warnings: + preview_lines.append("\n**Warnings:**") + preview_lines.extend([f"- {w}" for w in warnings]) + self.preview.object = "\n".join(preview_lines) + self._append_log("Scan complete.") + + def _on_validate(self, event=None) -> None: + if validate_build_config is None: + self.validation_panel.object = ( + "### Validation\nBackend validation function is not available.\n\n" + f"Import error: `{BACKEND_IMPORT_ERROR}`" + ) + self._append_log("Backend validation function is not available.") + return + + try: + config = self._make_build_config() + ok, errors, warnings = validate_build_config(config) + except Exception as exc: + self.validation_panel.object = f"### Validation\nValidation setup failed: `{exc}`" + self._append_log(f"Validation setup failed: {exc}") + return + + if ok: + lines = [ + "### Validation", + "**Status:** OK", + "", + "- UI settings are sufficient for the backend build step.", + "- Backend will also check grid compatibility during build.", + ] + if warnings: + lines.append("") + lines.append("**Warnings:**") + lines.extend([f"- {w}" for w in warnings]) + self.validation_panel.object = "\n".join(lines) + self._append_log("Validation completed successfully.") + else: + lines = ["### Validation", "**Status:** Issues found", ""] + lines.extend([f"- {e}" for e in errors]) + if warnings: + lines.append("") + lines.append("**Warnings:**") + lines.extend([f"- {w}" for w in warnings]) + self.validation_panel.object = "\n".join(lines) + self._append_log(f"Validation completed with {len(errors)} error(s).") + + def 
_on_build(self, event=None) -> None: + if build_standardized_netcdf is None: + self._append_log(f"Backend build function is not available. Import error: {BACKEND_IMPORT_ERROR}") + return + + try: + config = self._make_build_config() + ok, errors, warnings = validate_build_config(config) + if not ok: + self.validation_panel.object = ( + "### Validation\n**Status:** Issues found\n\n" + + "\n".join(f"- {e}" for e in errors) + ) + self._append_log("Build stopped because validation failed.") + return + + self._append_log("Build started.") + manifest = build_standardized_netcdf(config) + self._append_log(f"Build complete: `{manifest['output_path']}`") + self._append_log(f"Manifest saved: `{manifest['manifest_path']}`") + + self.preview.object = ( + "### Build result\n" + f"- **Output file:** `{manifest['output_path']}`\n" + f"- **Manifest:** `{manifest['manifest_path']}`\n" + f"- **Output dimensions:** `{manifest['output_dims']}`\n" + f"- **Output variables:** {', '.join(manifest['output_variables'])}\n" + f"- **Output coordinates:** {', '.join(manifest['output_coords'])}" + ) + except Exception as exc: + self._append_log(f"Build failed: {exc}") + self.validation_panel.object = f"### Validation / Build error\n`{exc}`" + + def view(self): + input_col = pn.Column( + "## 1. Input files", + self.input_folder, + self.load_files_button, + self.input_files, + self.combine_mode, + self.scan_variables_button, + sizing_mode="stretch_width", + ) + + mapping_col = pn.Column( + "## 2. Variables, coordinates and time", + self.target_variable, + self.time_variable, + self.lat_variable, + self.lon_variable, + self.level_variable, + pn.layout.Divider(), + self.output_variable_name, + self.output_level_coord_name, + self.level_units, + self.level_units_custom, + self.cf_note, + pn.layout.Divider(), + "## 3. Level detection", + self.level_source, + self.level_regex, + self.level_table_path, + self.level_table_note, + pn.layout.Divider(), + "## 4. 
Time detection", + self.time_source, + self.time_regex, + self.time_format, + self.time_table_path, + self.time_table_note, + sizing_mode="stretch_width", + ) + + subset_output_col = pn.Column( + "## 5. Spatial subset", + self.use_bbox, + pn.Row(self.bbox_south, self.bbox_north, sizing_mode="stretch_width"), + pn.Row(self.bbox_west, self.bbox_east, sizing_mode="stretch_width"), + self.bbox_note, + pn.layout.Divider(), + "## 6. Time subset", + self.detected_time_range, + self.start_time, + self.end_time, + self.time_subset_note, + pn.layout.Divider(), + "## 7. Output settings", + self.output_folder, + self.output_filename, + self.output_mode, + self.use_dask_chunks, + self.chunking_mode, + pn.Row(self.chunk_time, self.chunk_level, sizing_mode="stretch_width"), + pn.Row(self.chunk_lat, self.chunk_lon, sizing_mode="stretch_width"), + self.enable_compression, + self.validate_button, + self.build_button, + sizing_mode="stretch_width", + ) + + main = pn.Column( + "# NetCDF Builder", + pn.Row(input_col, mapping_col, subset_output_col, sizing_mode="stretch_width"), + pn.Row(self.preview, self.validation_panel, self.log, sizing_mode="stretch_width"), + sizing_mode="stretch_width", + ) + return main + + +@register_view(ext_args=["floatpanel"]) +def view(): + app = NCBuilder_App() + template = DEFAULT_TEMPLATE( + main=[app.view()], + sidebar=[], + ) + return template + + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/app/apps/presence_data_preparation_app.py b/ecodata/app/apps/presence_data_preparation_app.py new file mode 100644 index 0000000..ddd3f45 --- /dev/null +++ b/ecodata/app/apps/presence_data_preparation_app.py @@ -0,0 +1,767 @@ +""" +eBird data preparation app for ECODATA-Prepare. 
+ +Provides UI to: +- Select EBD + Sampling Event tables using local file selectors +- Select region polygon using a local file selector, or use a bounding box +- Configure vetting filters +- Aggregate by time and export files usable by ECODATA-Animate +""" + +from __future__ import annotations + +import datetime as dt +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +import panel as pn +import pandas as pd +import numpy as np + +from ecodata.app.config import DEFAULT_TEMPLATE +from ecodata.app.models import FileSelector +from ecodata.panel_utils import register_view +from ecodata.presence_functions import ( + VettingOptions, + AggregationOptions, + aggregate_ebird_to_files, + export_tracks_from_aggregated_counts, + read_species_from_agg_counts, +) + + +def _ensure_dir(path: str) -> str: + """Create directory if missing and return absolute path.""" + path = os.path.abspath(path) + os.makedirs(path, exist_ok=True) + return path + + +def _safe_filename(s: str, default: str = "output") -> str: + """Return filesystem-safe filename.""" + s = (s or "").strip() + s = re.sub(r"[^A-Za-z0-9_.-]+", "_", s) + return s if s else default + + +@dataclass +class OutputPaths: + """Container for output file paths.""" + out_dir: str + agg_counts_csv: str + agg_presence_csv: str + tracks_csv: str + manifest_json: str + + +class EbirdPrepareApp: + """Panel app for preparing eBird data for ECODATA-Animate.""" + + def __init__(self): + self._paths: Optional[OutputPaths] = None + self._region_id: str = "region_1" + + def make_file_selector(name: str, file_pattern: str = "*") -> FileSelector: + return FileSelector( + name=name, + directory=str(Path.home()), + file_pattern=file_pattern, + only_files=True, + constrain_path=False, + expanded=True, + size=10, + sizing_mode="stretch_width", + ) + + self.source_mode = pn.widgets.RadioButtonGroup( + name="Data source", + options=[ + "EBD file", + "Sampling Event file", + 
"Region polygon ", + ], + value="EBD file", + button_type="primary", + ) + + self.ebd_path = make_file_selector( + "EBD local path", + "*", + ) + self.sampling_path = make_file_selector( + "Sampling local path", + "*", + ) + self.polygon_path = make_file_selector( + "Region polygon local path (shapefile or GeoJSON)", + "*", + ) + + self.spatial_filter_mode = pn.widgets.RadioButtonGroup( + name="Spatial filter", + options=["Region polygon", "Bounding box"], + value="Region polygon", + button_type="primary", + ) + + self.bbox_west = pn.widgets.FloatInput( + name="West / min longitude", + value=-88.5, + step=0.1, + ) + self.bbox_south = pn.widgets.FloatInput( + name="South / min latitude", + value=30.1, + step=0.1, + ) + self.bbox_east = pn.widgets.FloatInput( + name="East / max longitude", + value=-84.8, + step=0.1, + ) + self.bbox_north = pn.widgets.FloatInput( + name="North / max latitude", + value=35.1, + step=0.1, + ) + + self.bbox_help = pn.pane.Markdown( + "Use geographic coordinates in EPSG:4326. 
\n" + "- longitude: -180 … 180 \n" + "- latitude: -90 … 90 \n" + "- west < east, south < north", + sizing_mode="stretch_width", + ) + + + self.protocols = pn.widgets.MultiChoice( + name="Allowed protocols (optional)", + options=["Traveling", "Stationary", "Area", "Incidental", "Historical"], + value=["Traveling", "Stationary", "Area"], + ) + self.chk_exclude_incidental = pn.widgets.Checkbox( + name="Exclude incidental/historical", + value=True, + ) + + self.chk_reviewed = pn.widgets.Checkbox( + name="REVIEWED", + value=False, + ) + + self.chk_approved = pn.widgets.Checkbox( + name="APPROVED", + value=False, + ) + + self.chk_all_species_reported = pn.widgets.Checkbox( + name="ALL SPECIES REPORTED", + value=False, + ) + + self.duration_min = pn.widgets.IntInput( + name="Min duration (minutes)", + value=0, + start=0, + ) + self.duration_max = pn.widgets.IntInput( + name="Max duration (minutes)", + value=600, + start=0, + ) + self.distance_min = pn.widgets.FloatInput( + name="Min distance (km)", + value=0.0, + start=0.0, + step=0.1, + ) + self.distance_max = pn.widgets.FloatInput( + name="Max distance (km)", + value=50.0, + start=0.0, + step=0.1, + ) + self.chk_require_valid_coords = pn.widgets.Checkbox( + name="Require valid coordinates", + value=True, + ) + self.max_count_clip = pn.widgets.IntInput( + name="Clip extreme counts above (0=off)", + value=0, + start=0, + ) + + today = dt.date.today() + self.date_start = pn.widgets.DatePicker( + name="Start date", + value=today - dt.timedelta(days=30), + ) + self.date_end = pn.widgets.DatePicker( + name="End date", + value=today, + ) + self.aggregation_days = pn.widgets.IntInput( + name="Aggregation step (days)", + value=7, + start=1, + ) + + self.grid_step_deg = pn.widgets.FloatInput( + name="Grid step (degrees, 0 = use original coordinates)", + value=0.0, + start=0.0, + step=0.1, + ) + + self.min_reporting_rate = pn.widgets.FloatInput( + name="Min frequency of detection (reporting_rate)", + value=0.0, + start=0.0, + 
step=0.01, + ) + self.min_count_per_complete_checklist = pn.widgets.FloatInput( + name="Min effort-standardized count", + value=0.0, + start=0.0, + step=0.1, + ) + self.min_sampling_support = pn.widgets.IntInput( + name="Min sampling support (n_complete_checklists)", + value=0, + start=0, + ) + + self.output_dir = pn.widgets.TextInput( + name="Output folder", + value=str(Path.home() / "Downloads"), + ) + self.run_name = pn.widgets.TextInput( + name="Run name", + value="presence_run", + ) + + # output filename for "tracks" export (used only to name the exported file in UI) + self.output_filename = pn.widgets.TextInput( + name="Output filename", + value="presence_points.csv", + placeholder="presence_points.csv", + ) + + self.btn_aggregate = pn.widgets.Button( + name="Aggregate", + button_type="primary", + ) + self.btn_export_tracks = pn.widgets.Button( + name="Export file for ECODATA-Animate", + button_type="primary", + icon="download", + ) + + self.species_select = pn.widgets.MultiChoice( + name="Species in results", + options=[], + value=[], + ) + + self.status = pn.pane.Alert("Ready.", alert_type="success") + self.log = pn.pane.Markdown("### Log\n", sizing_mode="stretch_both") + self.outputs_view = pn.pane.Markdown("### Outputs\nNo outputs yet.", sizing_mode="stretch_width") + + self.spatial_filter_mode.param.watch(self._on_spatial_mode_changed, "value") + + self.btn_aggregate.on_click(self._on_aggregate_clicked) + self.btn_export_tracks.on_click(self._on_export_tracks_clicked) + + # Rebuild layout (controls column vs results column) + self.sidebar = pn.Spacer(height=0) + self.main = pn.Column(self._build_main(), sizing_mode="stretch_both") + + + def _append_log(self, msg: str) -> None: + """Append log line with timestamp.""" + ts = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + self.log.object += f"\n- `{ts}` {msg}" + + def _set_status(self, msg: str, kind: str = "info") -> None: + """Set status alert message.""" + self.status.object = msg + 
self.status.alert_type = kind + + def _compute_paths(self) -> OutputPaths: + """Compute output paths from output_dir and run_name.""" + out_dir = _ensure_dir(self.output_dir.value) + run = _safe_filename(self.run_name.value, default="presence_run") + return OutputPaths( + out_dir=out_dir, + agg_counts_csv=os.path.join(out_dir, f"{run}__agg_counts.csv"), + agg_presence_csv=os.path.join(out_dir, f"{run}__agg_presence.csv"), + tracks_csv=os.path.join(out_dir, f"{run}__presence_points.csv"), + manifest_json=os.path.join(out_dir, f"{run}__manifest.json"), + ) + + def _format_coord_token(self, value: float) -> str: + """ + Format coordinate for safe use in region_id / filenames. + Example: -88.4667 -> m88p4667 + """ + s = f"{float(value):.4f}" + s = s.replace("-", "m").replace(".", "p") + return s + + + def _build_region_id( + self, + *, + bbox: tuple[float, float, float, float] | None, + polygon_filename_hint: str = "", + ) -> str: + """ + Build region_id from bbox coordinates or polygon filename. 
+ """ + if bbox is not None: + west, south, east, north = bbox + return ( + "bbox_" + f"{self._format_coord_token(west)}_" + f"{self._format_coord_token(south)}_" + f"{self._format_coord_token(east)}_" + f"{self._format_coord_token(north)}" + ) + + name = os.path.basename(polygon_filename_hint or "").strip() + if name: + stem = os.path.splitext(name)[0] + safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in stem) + safe = safe.strip("_") + if safe: + return f"poly_{safe}" + + return "poly_region" + + def _resolve_table_source(self, kind: str) -> tuple[str, str]: + """Resolve EBD or Sampling input as a local filesystem path.""" + kind = str(kind).strip().lower() + if kind not in {"ebd", "sampling"}: + raise ValueError(f"Unknown table kind: {kind}") + + selector = self.ebd_path if kind == "ebd" else self.sampling_path + label = "EBD" if kind == "ebd" else "Sampling" + + path = str(selector.value or "").strip() + if not path: + raise ValueError(f"Select {label} file with the file selector.") + if not os.path.exists(path): + raise ValueError(f"{label} path does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"{label} path is not a file: {path}") + + return path, os.path.basename(path) + + def _resolve_polygon_source(self) -> tuple[str, str]: + """Resolve polygon input as a local filesystem path.""" + path = str(self.polygon_path.value or "").strip() + if not path: + raise ValueError("Select a polygon file with the file selector.") + if not os.path.exists(path): + raise ValueError(f"Polygon path does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"Polygon path is not a file: {path}") + return path, os.path.basename(path) + + def _on_spatial_mode_changed(self, event) -> None: + """Refresh UI when spatial filter mode changes.""" + self.main[:] = [self._build_main()] + + def _build_sidebar(self) -> pn.Column: + """Build controls (left column).""" + + if self.spatial_filter_mode.value == "Region polygon": + 
spatial_controls = pn.Column( + self.spatial_filter_mode, + self.polygon_path, + sizing_mode="stretch_width", + ) + else: + spatial_controls = pn.Column( + self.spatial_filter_mode, + pn.Row( + self.bbox_west, + self.bbox_south, + self.bbox_east, + self.bbox_north, + sizing_mode="stretch_width", + ), + self.bbox_help, + sizing_mode="stretch_width", + ) + + io_box = pn.Column( + pn.pane.Markdown("#### 1. Inputs"), + pn.Row( + pn.Column( + pn.pane.Markdown("**EBD file**"), + self.ebd_path, + sizing_mode="stretch_width", + ), + + pn.Column( + pn.pane.Markdown("**Sampling Event file**"), + self.sampling_path, + sizing_mode="stretch_width", + ), + + sizing_mode="stretch_width", + ), + + pn.layout.Divider(), + + pn.pane.Markdown("#### 2. Spatial subset"), + spatial_controls, + + pn.layout.Divider(), + + pn.pane.Markdown("#### 3. Outputs"), + pn.Row( + self.output_dir, + self.run_name, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + vet_box = pn.Column( + pn.pane.Markdown("#### 4. Vetting / filtering"), + + # 1) all checkboxes in one row + pn.Row( + self.chk_reviewed, + self.chk_approved, + self.chk_all_species_reported, + self.chk_exclude_incidental, + self.chk_require_valid_coords, + sizing_mode="stretch_width", + ), + + # protocols + max count clip + pn.Row( + self.protocols, + self.max_count_clip, + sizing_mode="stretch_width", + ), + + # Min / Max duration in one row + pn.Row( + self.duration_min, + self.duration_max, + sizing_mode="stretch_width", + ), + + # Min / Max distance in next row + pn.Row( + self.distance_min, + self.distance_max, + sizing_mode="stretch_width", + ), + + sizing_mode="stretch_width", + ) + + + time_box = pn.Column( + pn.pane.Markdown("#### 5. Time and spatial aggregation"), + pn.Row( + self.date_start, + self.date_end, + self.aggregation_days, + self.grid_step_deg, + sizing_mode="stretch_width", + ), + pn.layout.Divider(), + pn.pane.Markdown("#### 6. 
Derived-metric filters"), + pn.Row( + self.min_reporting_rate, + self.min_count_per_complete_checklist, + self.min_sampling_support, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + actions = pn.Column( + pn.pane.Markdown("#### 7. Actions"), + pn.Row(self.btn_aggregate, sizing_mode="stretch_width"), + pn.layout.Divider(), + # 6) Species before export + add output filename in same row + pn.Row( + self.species_select, + self.output_filename, + self.btn_export_tracks, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + return pn.Column(io_box, vet_box, time_box, actions, sizing_mode="stretch_width") + + + def _build_main(self) -> pn.Row: + """Build 2-column layout: controls (wider) + outputs/log (narrower).""" + + controls = pn.Column( + pn.pane.Markdown("## Animal presence data preparation (eBird-compatible format)"), + self._build_sidebar(), + sizing_mode="stretch_both", + styles={"flex": "2"}, # 1) first column wider + ) + + results = pn.Column( + self.status, + self.outputs_view, + pn.layout.Divider(), + self.log, + sizing_mode="stretch_both", + styles={"flex": "1"}, # second column narrower + ) + + return pn.Row(controls, results, sizing_mode="stretch_both") + + def _apply_metric_filters_to_counts(self, counts_csv: str) -> List[str]: + """ + Apply derived-metric filters to aggregated counts CSV in place. 
+ + Returns: + - updated species list after filtering + """ + if not counts_csv or not os.path.exists(counts_csv): + return [] + + df = pd.read_csv(counts_csv) + + if "reporting_rate" in df.columns: + df = df[df["reporting_rate"].fillna(-np.inf) >= float(self.min_reporting_rate.value or 0.0)] + + if "count_per_complete_checklist" in df.columns: + df = df[ + df["count_per_complete_checklist"].fillna(-np.inf) + >= float(self.min_count_per_complete_checklist.value or 0.0) + ] + + if "n_complete_checklists" in df.columns: + df = df[df["n_complete_checklists"].fillna(0) >= int(self.min_sampling_support.value or 0)] + + df.to_csv(counts_csv, index=False, encoding="utf-8") + return sorted(df["species"].dropna().astype(str).unique().tolist()) + + def _on_aggregate_clicked(self, _event) -> None: + """Run backend aggregation and update species list.""" + + try: + ebd_source, ebd_name = self._resolve_table_source("ebd") + sampling_source, sampling_name = self._resolve_table_source("sampling") + except Exception as e: + self._set_status(str(e), "danger") + self._append_log(f"Aggregation blocked: {e}") + return + + polygon_source = None + polygon_filename_hint = "" + bbox = None + + if self.spatial_filter_mode.value == "Region polygon": + try: + polygon_source, polygon_filename_hint = self._resolve_polygon_source() + except Exception as e: + self._set_status(str(e), "danger") + self._append_log(f"Aggregation blocked: {e}") + return + else: + bbox_values = [ + self.bbox_west.value, + self.bbox_south.value, + self.bbox_east.value, + self.bbox_north.value, + ] + if any(v is None for v in bbox_values): + self._set_status("Fill all four bbox coordinates.", "danger") + self._append_log("Aggregation blocked: incomplete bbox.") + return + bbox = tuple(float(v) for v in bbox_values) + region_id = self._build_region_id( + bbox=bbox, + polygon_filename_hint=polygon_filename_hint, + ) + start = self.date_start.value + end = self.date_end.value + if not start or not end or end < start: + 
self._set_status("Check start/end dates.", "danger") + self._append_log("Aggregation blocked: invalid dates.") + return + step_days = int(self.aggregation_days.value or 0) + if step_days < 1: + self._set_status("Aggregation step (days) must be >= 1.", "danger") + self._append_log("Aggregation blocked: invalid aggregation_days.") + return + grid_step_deg = float(self.grid_step_deg.value or 0.0) + if grid_step_deg < 0: + self._set_status("Grid step (degrees) must be >= 0.", "danger") + self._append_log("Aggregation blocked: invalid grid_step_deg.") + return + + self._paths = self._compute_paths() + + vet = VettingOptions( + require_reviewed=bool(self.chk_reviewed.value), + require_approved=bool(self.chk_approved.value), + require_all_species_reported=bool(self.chk_all_species_reported.value), + allowed_protocols=list(self.protocols.value) if self.protocols.value else None, + exclude_incidental_historical=bool(self.chk_exclude_incidental.value), + duration_min_minutes=int(self.duration_min.value or 0), + duration_max_minutes=int(self.duration_max.value or 600), + distance_min_km=float(self.distance_min.value or 0.0), + distance_max_km=float(self.distance_max.value or 50.0), + require_valid_coords=bool(self.chk_require_valid_coords.value), + clip_counts_above=int(self.max_count_clip.value or 0), + ) + + agg = AggregationOptions( + start_date=start, + end_date=end, + step_days=step_days, + grid_step_deg=grid_step_deg, + treat_x_as_one=True, + ) + + try: + self._set_status("Aggregating…", "warning") + self._append_log("Aggregation started.") + self._append_log(f"EBD source: local path -> {ebd_source}") + self._append_log(f"Sampling source: local path -> {sampling_source}") + + if bbox is not None: + self._append_log( + f"Using bbox: west={bbox[0]}, south={bbox[1]}, east={bbox[2]}, north={bbox[3]}." + ) + else: + self._append_log( + f"Using polygon: {polygon_filename_hint or '[unknown name]'}." 
+ ) + self._append_log(f"Region ID: {region_id}") + self._append_log(f"Aggregation step: {step_days} day(s).") + if grid_step_deg > 0: + self._append_log(f"Grid aggregation enabled: {grid_step_deg} degree(s).") + else: + self._append_log("Grid aggregation disabled: using original observation coordinates.") + + self._append_log( + "Metric filters: " + f"reporting_rate >= {float(self.min_reporting_rate.value or 0.0)}, " + f"count_per_complete_checklist >= {float(self.min_count_per_complete_checklist.value or 0.0)}, " + f"n_complete_checklists >= {int(self.min_sampling_support.value or 0)}." + ) + + species_all = aggregate_ebird_to_files( + ebd_bytes=ebd_source, + sampling_bytes=sampling_source, + polygon_bytes=polygon_source, + polygon_filename_hint=polygon_filename_hint, + bbox=bbox, + ebd_filename_hint=ebd_name or "ebd", + sampling_filename_hint=sampling_name or "sampling", + region_id=region_id, + agg=agg, + vet=vet, + out_counts_csv=self._paths.agg_counts_csv, + out_presence_csv=self._paths.agg_presence_csv, + manifest_json=self._paths.manifest_json, + ) + + species = self._apply_metric_filters_to_counts(self._paths.agg_counts_csv) + + self.species_select.options = species + self.species_select.value = [] + + self._set_status("Aggregation complete.", "success") + self._append_log(f"Created: {self._paths.agg_counts_csv}") + self._append_log(f"Created: {self._paths.agg_presence_csv}") + + self.outputs_view.object = ( + "### Outputs\n" + f"- **Aggregated counts (A)**: `{self._paths.agg_counts_csv}`\n" + f"- **Presence/absence (B)**: `{self._paths.agg_presence_csv}`\n" + f"- **Manifest**: `{self._paths.manifest_json}`\n" + ) + + except Exception as e: + self._set_status(f"Aggregation failed: {e}", "danger") + self._append_log(f"Aggregation failed: {e}") + + def _on_export_tracks_clicked(self, _event) -> None: + """Export Movebank-like pseudo-tracks CSV from aggregated counts.""" + if not self._paths: + self._paths = self._compute_paths() + bbox = None + 
polygon_filename_hint = "" + + if self.spatial_filter_mode.value == "Region polygon": + polygon_path = str(self.polygon_path.value or "").strip() + polygon_filename_hint = os.path.basename(polygon_path) if polygon_path else "" + else: + bbox_values = [ + self.bbox_west.value, + self.bbox_south.value, + self.bbox_east.value, + self.bbox_north.value, + ] + if not any(v is None for v in bbox_values): + bbox = tuple(float(v) for v in bbox_values) + + region_id = self._build_region_id( + bbox=bbox, + polygon_filename_hint=polygon_filename_hint, + ) + self._append_log(f"Export region ID: {region_id}") + try: + export_tracks_from_aggregated_counts( + agg_counts_csv=self._paths.agg_counts_csv, + tracks_csv=self._paths.tracks_csv, + region_id=region_id, + id_mode="species", + species_filter=list(self.species_select.value) if self.species_select.value else None, + ) + + sp = read_species_from_agg_counts(self._paths.agg_counts_csv) + self.species_select.options = sp + + self._set_status("Export complete.", "success") + self._append_log(f"Created: {self._paths.tracks_csv}") + + self.outputs_view.object = ( + (self.outputs_view.object or "### Outputs\n") + + f"\n- **presence_points.csv (for Animate)**: `{self._paths.tracks_csv}`\n" + ) + + except Exception as e: + self._set_status(f"Export failed: {e}", "danger") + self._append_log(f"Export failed: {e}") + + +@register_view(ext_args=["floatpanel"]) +def view(): + """Create a fresh app instance and return a template for ECODATA routing.""" + app = EbirdPrepareApp() + template = DEFAULT_TEMPLATE( + main=[app.main], + sidebar=[], + ) + return template + + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/movebank_functions.py b/ecodata/movebank_functions.py new file mode 100644 index 0000000..a0e5cff --- /dev/null +++ b/ecodata/movebank_functions.py @@ -0,0 +1,995 @@ +""" +movebank_functions.py + +Processes Movebank CSV datasets using "timestamp" 
or "eobs:start-timestamp" column, +filter data by "individual-taxon-canonical-name" and "individual-local-identifier". + +Interpolation is always performed first using a 1-minute interval. This produces a regularly spaced time series. + +After interpolation, optional averaging is performed over a user-defined interval (e.g. 30 minutes). +Only numeric columns (such as 'eobs:temperature', 'ground-speed', 'height-above-ellipsoid') are averaged. + +All non-numeric columns (e.g. metadata or identifiers) are forward-filled from the last known value +during interpolation and retained without modification during averaging. +""" + +import csv +from datetime import datetime, timedelta +import pandas as pd +from pathlib import Path +import numpy as np +import re + +TIME_COLUMN = 'timestamp' # Set to "eobs:start-timestamp" or "timestamp" as needed + +def parse_timestamp(s: str) -> datetime: + """ + Robust timestamp parser: + - Keeps backward compatibility with ISO-like strings: 'YYYY-MM-DD HH:MM:SS[.ffffff]' + - Supports day-first formats: 'DD.MM.YYYY HH:MM', 'DD.MM.YYYY HH:MM:SS[.ffffff]' + - Accepts 'T' separator and 'Z' / timezone offsets (drops tzinfo → naive) + - Pads/truncates fractional seconds to 6 digits when present + """ + if s is None: + raise ValueError("Timestamp is None") + + s = str(s).strip() + if not s: + raise ValueError("Empty timestamp") + + # --- Fast path: ISO with optional 'Z' or offset --- + # Example: 2020-01-02T03:04:05.123Z, 2020-01-02 03:04:05.123456+02:00 + iso_candidate = s + if iso_candidate.endswith("Z"): + iso_candidate = iso_candidate[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(iso_candidate.replace("T", " ")) + # Drop tzinfo to keep backward-compatible naive datetimes + if dt.tzinfo is not None: + dt = dt.replace(tzinfo=None) + return dt + except Exception: + pass + + # --- Normalize fractional seconds to <= 6 digits (microseconds) --- + # Works for both 'YYYY-MM-DD ...' and 'DD.MM.YYYY ...' 
+ def _normalize_frac(text: str) -> str: + # split timezone if any to avoid touching the offset part + tz_match = re.search(r'([+-]\d{2}:\d{2}|[+-]\d{4})$', text) + tz = tz_match.group(0) if tz_match else "" + core = text[: -len(tz)] if tz else text + + if '.' in core: + head, frac = core.split('.', 1) + # cut off any trailing timezone-like part accidentally captured + frac = re.split(r'([+-]\d{2}:\d{2}|[+-]\d{4})', frac)[0] + frac = re.sub(r'\D', '', frac) # keep only digits + if len(frac) > 6: + frac = frac[:6] + elif 0 < len(frac) < 6: + frac = frac.ljust(6, '0') + core = f"{head}.{frac}" + return core + tz + + s_norm = _normalize_frac(s) + + # --- Try explicit known formats (old + new) --- + fmts = [ + # ISO-like (kept first for compatibility) + "%Y-%m-%d %H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + # day-first variants + "%d.%m.%Y %H:%M:%S.%f", + "%d.%m.%Y %H:%M:%S", + "%d.%m.%Y %H:%M", + # allow 'T' separator explicitly + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M", + ] + for fmt in fmts: + try: + return datetime.strptime(s_norm, fmt) + except Exception: + continue + + # --- Last resort: pandas inference (dayfirst=True) --- + dt = pd.to_datetime(s_norm, dayfirst=True, errors="coerce", utc=False) + if pd.isna(dt): + raise ValueError(f"Unparsable timestamp: {s}") + # Convert pandas Timestamp to naive datetime (drop tz if any) + py_dt = dt.to_pydatetime() + if hasattr(py_dt, "tzinfo") and py_dt.tzinfo is not None: + py_dt = py_dt.replace(tzinfo=None) + return py_dt + + +def safe_float(value): + """Safely converts a value to float. + Handles None, empty strings, and strips whitespace. + + Args: + value (str or float): Input value. + + Returns: + float or None: Parsed float or None if conversion fails. 
+ """ + if isinstance(value, float) or value is None: + return value + try: + return float(value.strip()) if value.strip() else None + except ValueError: + return None + +# --- Interpolation --- +def interpolate_points(start, end, interval, columns_to_interpolate): + """Generates linearly interpolated points between two observations. + + Args: + start (dict): The first observation row. + end (dict): The second observation row. + interval (timedelta): Interval at which to interpolate (e.g. 1 minute). + columns_to_interpolate (list): List of column names to interpolate. + + Returns: + list: A list of interpolated rows (dicts) between start and end. + """ + + start_time = parse_timestamp(start["timestamp"]) + end_time = parse_timestamp(end["timestamp"]) + + if start_time >= end_time: + return [] + + total_seconds = (end_time - start_time).total_seconds() + step_seconds = interval.total_seconds() + steps = int(total_seconds // step_seconds) + + if steps < 1: + return [] + + timestamps = [ + (start_time + timedelta(seconds=i * step_seconds)).strftime("%Y-%m-%d %H:%M:%S.%f")[:23] + for i in range(1, steps + 1) + ] + + alphas = np.linspace(1 / steps, 1.0, num=steps) + + interpolated_rows = [] + for idx, alpha in enumerate(alphas): + point = dict(start) + point["timestamp"] = timestamps[idx] + for col in columns_to_interpolate: + v_start = safe_float(start.get(col)) + v_end = safe_float(end.get(col)) + if v_start is not None and v_end is not None: + point[col] = v_start + alpha * (v_end - v_start) + else: + point[col] = None + interpolated_rows.append(point) + + return interpolated_rows + +# --- Fill Missing Data --- +def fill_missing_data(data): + """Fill missing lon/lat via linear interpolation between bounding points. + Uses actual lon/lat column names resolved from the data header. + Writes results back into the *same* lon/lat columns. 
+ """ + if not data: + return data + + # derive header from the first row and resolve actual lon/lat keys + fieldnames = list(data[0].keys()) + lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + id_key_in = resolve_id_key(fieldnames) + # if cannot resolve — nothing to do safely + if not lon_key or not lat_key: + return data + + i = 0 + while i < len(data): + # seek a block of rows where either lon or lat is missing + if data[i].get(lon_key) is None or data[i].get(lat_key) is None: + start_idx = i - 1 + while i < len(data) and (data[i].get(lon_key) is None or data[i].get(lat_key) is None): + i += 1 + end_idx = i + + # interpolate only if both ends exist + if 0 <= start_idx < len(data) and end_idx < len(data): + start = data[start_idx] + end = data[end_idx] + start_time = parse_timestamp(start["timestamp"]) + end_time = parse_timestamp(end["timestamp"]) + total_seconds = (end_time - start_time).total_seconds() or 0.0 + if total_seconds <= 0: + continue + + for j in range(start_idx + 1, end_idx): + current_time = parse_timestamp(data[j]["timestamp"]) + alpha = (current_time - start_time).total_seconds() / total_seconds + if start.get(lon_key) is not None and end.get(lon_key) is not None: + data[j][lon_key] = start[lon_key] + alpha * (end[lon_key] - start[lon_key]) + if start.get(lat_key) is not None and end.get(lat_key) is not None: + data[j][lat_key] = start[lat_key] + alpha * (end[lat_key] - start[lat_key]) + else: + i += 1 + + return data + +# --- Averaging --- +def average_by_time_interval(data, interval, columns_to_interpolate, actual_start_time, actual_end_time, allow_single=True): + """Averages numeric values over fixed time intervals. + + Args: + data (list of dict): Interpolated time series. + interval (timedelta): Averaging interval (e.g. 30 minutes). + columns_to_interpolate (list): Numeric columns to average. + actual_start_time (datetime): Start of valid time window. + actual_end_time (datetime): End of valid time window. 
+ allow_single (bool): Whether to keep single-record intervals. + + Returns: + list of dict: Averaged records by time interval. + """ + + if not data: + return [] + + df = pd.DataFrame(data).copy() + df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce') + df = df.dropna(subset=["timestamp"]) + df = df.sort_values("timestamp") + + interval_minutes = int(interval.total_seconds() // 60) + df["interval_start"] = df["timestamp"].dt.floor(f"{interval_minutes}T") + grouped = df.groupby("interval_start") + + numeric_cols = [] + for col in columns_to_interpolate: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + numeric_cols.append(col) + + result = grouped[numeric_cols].mean() if numeric_cols else pd.DataFrame(index=grouped.size().index) + metadata_cols = [col for col in df.columns if col not in numeric_cols + ["timestamp", "interval_start"]] + for col in metadata_cols: + result[col] = grouped[col].first() + + result = result.reset_index() + result = result.rename(columns={"interval_start": "timestamp"}) + + if not allow_single: + group_sizes = grouped.size() + valid_groups = group_sizes[group_sizes > 1].index + result = result[result["timestamp"].isin(valid_groups)] + + result["timestamp"] = result["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:23] + return result.to_dict(orient="records") + +# --- Validation --- +def validate_and_process_csv(file_path): + """ + Inspect a Movebank CSV header and return a list of ORIGINAL column names + that are suitable for interpolation/averaging. + + - Robust to header variations: '-', '_', '.', ':' are treated equally. + - Picks synonyms for lon/lat and common numeric fields (e.g., eobs:temperature). + - Time/ID columns are detected but EXCLUDED from the returned list. + - Returns ORIGINAL header names (exactly as in the file). 
+ + Returns + ------- + list[str] + Ordered list of present columns to be used as numeric candidates for + interpolation/averaging (e.g., ['location_lon', 'location_lat', 'eobs:temperature', ...]). + """ + + def _norm(s: str) -> str: + # normalize header keys: "EOBS:Temperature" -> "eobs_temperature" + return re.sub(r"[-:.\s]+", "_", str(s).lower()).strip("_") + + # 1) read header + try: + with open(Path(file_path), "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + raw_fields = reader.fieldnames or [] + except Exception as e: + print(f"[validate_and_process_csv] Failed to read header: {e}") + return [] + + if not raw_fields: + return [] + + # 2) build normalized->original map + norm_to_orig = {} + for col in raw_fields: + nk = _norm(col) + # keep the first occurrence to preserve a stable, human header where possible + if nk not in norm_to_orig: + norm_to_orig[nk] = col + + present = set(norm_to_orig.keys()) + + # 3) define synonym groups + time_syns = [ + "timestamp", "eobs_start_timestamp", "eobs:start-timestamp", + "datetime", "date_time", "date", "time" + ] + id_syns = [ + "individual_local_identifier", "individual-local-identifier" + ] + lon_syns = [ + "location_long", "location_lon", "location-long", "location-lon", + "longitude", "lon", "location_longitude", "location.longitude" + ] + lat_syns = [ + "location_lat", "location-lat", + "latitude", "lat", "location_latitude", "location.latitude" + ] + # common numeric fields you typically interpolate/average + temp_syns = ["eobs_temperature", "eobs:temperature", "temperature"] + gspeed_syns = ["ground_speed", "ground-speed", "speed_2d", "speed"] + hae_syns = [ + "height_above_ellipsoid", "height-above-ellipsoid", + "gps_altitude", "altitude", "altitude_above_sea_level" + ] + + def _pick_first(syns): + for s in syns: + nk = _norm(s) + if nk in present: + return norm_to_orig[nk] + return None + + # 4) choose actual originals (if present) + time_col = _pick_first(time_syns) # not returned, for 
info/exclusion only + id_col = _pick_first(id_syns) # not returned + + lon_col = _pick_first(lon_syns) + lat_col = _pick_first(lat_syns) + temp_col = _pick_first(temp_syns) + gs_col = _pick_first(gspeed_syns) + hae_col = _pick_first(hae_syns) + + # 5) build the result list (keep a sensible order: coords first) + result = [] + for c in (lon_col, lat_col, temp_col, gs_col, hae_col): + if c and c not in result: + result.append(c) + + # You may also include any additional numeric columns here if you wish: + # e.g., any column whose normalized name starts with "eobs_" and is present. + # Just make sure to exclude time/id-like names: + time_like = {_norm(x) for x in time_syns} + id_like = {_norm(x) for x in id_syns} + + for nk, orig in norm_to_orig.items(): + if nk in time_like or nk in id_like: + continue + # already included? + if orig in result: + continue + # optional heuristic: include other eobs:* numeric-looking fields + if nk.startswith("eobs_"): + result.append(orig) + + return result + +# --- Main Processing --- +def process_csv_interp_or_averaging(start_time_str, end_time_str, interval_minutes, + csv_file, output_csv, local_identifier, + columns_to_interpolate=None, allow_single=True, + deployment_time_gap=60, min_expected_obs=1, + start_from_midnight=False): + """Processes a single individual's movement data with interpolation and optional averaging. + Includes filtering by time and ID, interpolation, averaging, session splitting, and final cleanup. + + Args: + start_time_str (str): Start datetime string. + end_time_str (str): End datetime string. + interval_minutes (int): Time step for averaging. + csv_file (Path): Path to input Movebank CSV. + output_csv (str): Output file path template. + local_identifier (str): Individual ID to process. + columns_to_interpolate (list): Columns to interpolate. + allow_single (bool): Keep intervals with one record. + deployment_time_gap (int): Maximum gap (min) to split sessions. 
+ min_expected_obs (int): Minimum points required to keep a session. + start_from_midnight (bool): If True, truncate to 00:00 and start from it. + + Returns: + list: List of generated CSV file paths. + """ + if columns_to_interpolate is None: + columns_to_interpolate = [] + columns_to_interpolate = [col for col in columns_to_interpolate if col not in ("timestamp", "eobs:start-timestamp")] + + start_time = parse_timestamp(start_time_str) + end_time = parse_timestamp(end_time_str) + interval = timedelta(minutes=interval_minutes) + min_interval = timedelta(minutes=1) + + data = [] + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + time_key_in = resolve_time_column(fieldnames) + lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + id_key_in = resolve_id_key(fieldnames) + for row in reader: + try: + ts_raw = row.get(time_key_in) + if not ts_raw: + continue + row_time = parse_timestamp(ts_raw) + except Exception: + continue + + row_id = row.get(id_key_in) if id_key_in else None + if row_id is None: + continue + + row_id_str = str(row_id).strip() + expected_id_str = str(local_identifier).strip() + + if row_id_str != expected_id_str: + continue + + if start_time <= row_time <= end_time: + row["timestamp"] = ts_raw + data.append(row) + + if len(data) < 2: + print("Not enough data after filtering.") + return [] + + data.sort(key=lambda x: parse_timestamp(x["timestamp"])) + + # Cut off at midnight and insert 00:00:00 + if start_from_midnight and data: + first_time = parse_timestamp(data[0]["timestamp"]) + midnight = first_time.replace(hour=0, minute=0, second=0, microsecond=0) + + # Cut off points to 00:00:00 + data = [row for row in data if parse_timestamp(row["timestamp"]) >= midnight] + + # If there is no exact point 00:00:00 — insert an artificial one + if data and parse_timestamp(data[0]["timestamp"]) > midnight: + clone = dict(data[0]) + clone["timestamp"] = midnight.strftime("%Y-%m-%d 
%H:%M:%S.%f")[:23] + for col in columns_to_interpolate: + if col in clone: + clone[col] = clone[col] # copy the value from the first real point + data.insert(0, clone) + + if len(data) < 2: + print("Not enough data after start_from_midnight filtering.") + return [] + + for col in ["timestamp"] + columns_to_interpolate: + if col not in fieldnames: + fieldnames.append(col) + + data = fill_missing_data(data) + + def split_into_sessions(data, max_gap_minutes): + max_gap = timedelta(minutes=max_gap_minutes) + sessions = [] + current_session = [] + + for i, row in enumerate(data): + if i == 0: + current_session.append(row) + continue + + prev_time = parse_timestamp(data[i-1]['timestamp']) + curr_time = parse_timestamp(row['timestamp']) + if curr_time - prev_time > max_gap: + if current_session: + sessions.append(current_session) + current_session = [row] + else: + current_session.append(row) + + if current_session: + sessions.append(current_session) + return sessions + + sessions = split_into_sessions(data, deployment_time_gap) + result_paths = [] + + for idx, session in enumerate(sessions): + if len(session) < min_expected_obs: + print(f"Skipping session {idx+1} with only {len(session)} observations (less than min_expected_obs={min_expected_obs})") + continue + + interpolated_rows = [] + for i in range(len(session) - 1): + interpolated_rows.append(session[i]) + interpolated_rows.extend(interpolate_points(session[i], session[i + 1], min_interval, columns_to_interpolate)) + if session: + interpolated_rows.append(session[-1]) + + if interval.total_seconds() > 60: + result_rows = average_by_time_interval( + interpolated_rows, interval, columns_to_interpolate, + actual_start_time=parse_timestamp(session[0]['timestamp']), + actual_end_time=parse_timestamp(session[-1]['timestamp']), + allow_single=allow_single + ) + else: + result_rows = interpolated_rows + + start_str = session[0]['timestamp'].replace(":", "-").replace(" ", "T")[:16] + end_str = 
session[-1]['timestamp'].replace(":", "-").replace(" ", "T")[:16] + session_output_path = output_csv.replace(".csv", f"__{start_str}_to_{end_str}.csv") + + with open(session_output_path, "w", newline='', encoding="utf-8") as f: + if "individual-local-identifier-deployment-time" not in fieldnames: + fieldnames.append("individual-local-identifier-deployment-time") + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for row in result_rows: + ts = row.get("timestamp") + if ts is None: + continue + row["timestamp"] = str(ts)[:23] + row["individual-local-identifier-deployment-time"] = Path(session_output_path).stem + writer.writerow(row) + result_paths.append(session_output_path) + try: + df_check = normalize_column_names(pd.read_csv(session_output_path, low_memory=False)) + ... + if numeric_cols_to_fix: + ... + df_check.to_csv(session_output_path, index=False) + print(f"Interpolated missing values in: {numeric_cols_to_fix} for file {session_output_path}") + except Exception as e: + print(f"Interpolation post-check failed for {session_output_path}: {e}") + + # Checking and interpolating NaN after writing + cols_to_check_for_nan = [ + "timestamp", "location_long", "location_lat", + "eobs_start_timestamp", "eobs_temperature", + "ground_speed", "height_above_ellipsoid" + ] + if result_paths: + last_file = result_paths[-1] + try: + df_check = normalize_column_names(pd.read_csv(last_file, low_memory=False)) + numeric_cols_to_fix = [ + col for col in cols_to_check_for_nan + if col in df_check.columns + and df_check[col].dtype in ["float64", "int64"] + and df_check[col].isna().any() + ] + + if numeric_cols_to_fix: + df_check["timestamp"] = pd.to_datetime(df_check["timestamp"], errors="coerce") + df_check = df_check.set_index("timestamp") + df_check[numeric_cols_to_fix] = df_check[numeric_cols_to_fix].interpolate( + method="time", limit_direction="both" + ) + df_check = df_check.reset_index() + df_check.to_csv(last_file, index=False) + 
print(f"Interpolated missing values in: {numeric_cols_to_fix} for file {last_file}") + except Exception as e: + print(f"Interpolation post-check failed for {last_file}: {e}") + + return result_paths + +# --- Merging --- +def merge_csv_files_from_folder(folder_path: Path, delete_empty_columns: bool) -> (pd.DataFrame, list): + """Merges multiple CSV files into one DataFrame. + Optionally deletes columns that are not shared across files. + + Args: + folder_path (Path): Directory containing CSV files. + delete_empty_columns (bool): If True, remove non-overlapping columns. + + Returns: + tuple: (merged DataFrame, list of removed column names, list of source CSV file paths) + """ + csv_files = sorted(folder_path.glob("*.csv")) + if not csv_files: + raise ValueError("No CSV files found in the selected folder.") + dataframes = [normalize_column_names(pd.read_csv(f)) for f in csv_files] + all_columns = set() + for df in dataframes: + all_columns.update(df.columns) + missing_columns = {col for col in all_columns if any(col not in df.columns for df in dataframes)} + if delete_empty_columns and missing_columns: + cleaned_dataframes = [df.drop(columns=list(missing_columns), errors='ignore') for df in dataframes] + merged_df = pd.concat(cleaned_dataframes, ignore_index=True) + else: + merged_df = pd.concat(dataframes, ignore_index=True) + return merged_df, sorted(missing_columns), [str(p) for p in csv_files] + +# --- Filename --- +def safe_filename(name: str, replacement: str = "_") -> str: + """Generates a filesystem-safe filename by replacing invalid characters. + + Args: + name (str): Original filename string. + replacement (str): Replacement for invalid characters. + + Returns: + str: Sanitized filename. 
+ """ + return re.sub(r'[\\/:*?"<>| ]+', replacement, name).strip() + +# --- Batch --- +def generate_individual_csvs_for_local_ids(csv_file: Path, ids: list, + start_time, end_time, interval_minutes: int, + output_path_template: str, columns_to_interpolate: list, + deployment_time_gap: int = 60, + min_expected_obs: int = 100, + start_from_midnight = False) -> list: + """Processes movement data for multiple individuals into separate files. + + Calls process_csv_interp_or_averaging for each ID and aggregates results. + + Args: + csv_file (Path): Input CSV file path. + ids (list of str): List of local identifiers (tags). + start_time (str): Start datetime string. + end_time (str): End datetime string. + interval_minutes (int): Time step in minutes. + output_path_template (str): Base output path for naming files. + columns_to_interpolate (list): Columns for interpolation. + deployment_time_gap (int): Max gap in minutes to split sessions. + min_expected_obs (int): Minimum observations per session. + start_from_midnight (bool): If True, truncate sessions to start at 00:00. + + Returns: + list: List of output file paths. 
+ """ + output_files = [] + for id in ids: + save_name_by_ID = safe_filename(id) + output = output_path_template.replace(".csv", f"_{save_name_by_ID}.csv") + result_paths = process_csv_interp_or_averaging( + start_time_str=start_time, + end_time_str=end_time, + interval_minutes=interval_minutes, + csv_file=csv_file, + output_csv=output, + local_identifier=id, + columns_to_interpolate=columns_to_interpolate, + deployment_time_gap=deployment_time_gap, + min_expected_obs=min_expected_obs, + start_from_midnight=start_from_midnight + ) + output_files.extend(result_paths) + return output_files + +def interpolate_missing_values_only(start_time_str: str, + end_time_str: str, + csv_file: Path, + ids: list, + columns_to_interpolate: list, + output_path_template: str, + max_gap_minutes: int = 24*60) -> list: + """ + Fill-in missing numeric values *within existing rows only* (no new rows created), + using time-based interpolation limited to gaps ≤ `max_gap_minutes` between two + known observations. Interpolation is performed independently per Individual ID. + + This function is designed for the "Simple interpolation (missing ≤ 1 day)" button: + - It does NOT build a regular 1-minute timeline. + - It only fills NaNs that lie strictly between two valid values where the total + time span between those two values is ≤ `max_gap_minutes`. + - It preserves original column names and writes timestamps back into the original + time column (if it exists), otherwise creates one. + + Parameters + ---------- + start_time_str : str + Start of the time window (string; parsed by `parse_timestamp`). + end_time_str : str + End of the time window (string; parsed by `parse_timestamp`). + csv_file : Path + Path to the input Movebank CSV. + ids : list + List of `individual-local-identifier` values to process independently. + columns_to_interpolate : list + Candidate columns for interpolation (original headers as in CSV). + Time-like columns (e.g., 'timestamp', 'eobs:start-timestamp') are ignored. 
+ output_path_template : str + Template for output CSV path; per-ID files are created by appending + `_{safe_id}__interp_inplace_le1d.csv` before the ".csv" suffix. + max_gap_minutes : int, default 24*60 + Maximum allowed gap (in minutes) between two valid values to fill NaNs inside. + + Returns + ------- + list of str + Paths to the created per-ID CSV files. + + Notes + ----- + - Time parsing relies on `parse_timestamp`, which should support both ISO-like + and 'DD.MM.YYYY HH:MM[:SS[.fff]]' formats (and possibly 'T'/'Z'/offsets). + - The function matches the time & ID columns via *normalized* header keys, + but preserves original headers in the written output. + """ + + # --- Helpers (scoped locally to avoid polluting the module namespace) ---------- + def _norm_key(s: str) -> str: + """Normalize a single header key to a canonical form.""" + return re.sub(r"[-:.\s]+", "_", str(s).lower()).strip("_") + + def _norm_keys(d: dict) -> dict: + """Normalize all keys in a row (dict) for robust lookup; values unchanged.""" + return {_norm_key(k): v for k, v in d.items()} + + def _pick_time_col_from_df(df: pd.DataFrame) -> str: + """ + Choose which original column in df should store timestamps in the output. + Preference order: + 1) TIME_COLUMN (global) if present (matching by normalized name), + 2) 'timestamp', + 3) 'eobs_start_timestamp', + 4) 'time', 'datetime', 'date'. + Returns the *original* column name if found; otherwise returns TIME_COLUMN + (creating it later if missing). + """ + # Map normalized -> original + colmap = {_norm_key(c): c for c in df.columns} + + # TIME_COLUMN may be 'timestamp' or 'eobs:start-timestamp', etc. 
+ time_key_norm = _norm_key(TIME_COLUMN) + candidates_norm = [ + time_key_norm, + "timestamp", + "eobs_start_timestamp", + "time", + "datetime", + "date", + ] + for nk in candidates_norm: + if nk in colmap: + return colmap[nk] + # fallback: use the global TIME_COLUMN string as-is + return TIME_COLUMN + + # --- Parse time window --------------------------------------------------------- + start_time = parse_timestamp(start_time_str) + end_time = parse_timestamp(end_time_str) + created_paths: list[str] = [] + + # --- Build a set of time-like normalized names to exclude from interpolation --- + time_like_norm = {"timestamp", "eobs_start_timestamp", "time", "datetime", "date"} + + # Prepare normalized view of the interpolation column list (but we will keep + # original names when writing to CSV) + + for local_id in ids: + rows = [] + dts = [] # parsed datetimes aligned with `rows` + fieldnames = None + + # --- Read only the current ID and time range ------------------------------ + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + time_key_in = resolve_time_column(fieldnames) + #lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + for row in reader: + norm = _norm_keys(row) + + # Time value: prefer TIME_COLUMN, then common alternates + time_key_norm = _norm_key(TIME_COLUMN) + ts_str = norm.get(_norm_key(time_key_in)) \ + or norm.get("timestamp") \ + or norm.get("eobs_start_timestamp") \ + or norm.get("time") \ + or norm.get("datetime") \ + or norm.get("date") + if not ts_str: + continue + try: + t = parse_timestamp(ts_str) + except Exception: + # Skip rows with unparsable timestamps + continue + + # ID filter + rid = (norm.get("individual_local_identifier") or "").strip() + if rid != str(local_id).strip(): + continue + + # Time window filter + if not (start_time <= t <= end_time): + continue + + rows.append(row) # keep original headers/values + dts.append(t) + + # If nothing matched for this ID: skip 
+ if not rows: + continue + + # --- Build DataFrame preserving original headers -------------------------- + df = pd.DataFrame(rows) + df["__dt"] = pd.to_datetime(dts) # already parsed, but ensure dtype + df = df.sort_values("__dt").set_index("__dt") + + # --- Interpolate each numeric column within allowed gaps ------------------- + # Keep only columns explicitly requested AND present in df, excluding any time-like + cols_to_fill = [] + for c in (columns_to_interpolate or []): + if c not in df.columns: + continue + if _norm_key(c) in time_like_norm: + continue + cols_to_fill.append(c) + + if cols_to_fill: + idx = df.index + max_gap = pd.Timedelta(minutes=max_gap_minutes) + + for col in cols_to_fill: + # Convert to numeric; non-numeric -> NaN + s = pd.to_numeric(df[col], errors="coerce") + if s.isna().all(): + # Nothing to interpolate in this column + df[col] = s + continue + + # Identify rows that are NaN between two valid values + orig_na = s.isna() + + # Timestamps of previous/next valid values + prev_t = pd.Series(idx.where(s.notna(), pd.NaT), index=idx).ffill() + next_t = pd.Series(idx.where(s.notna(), pd.NaT), index=idx).bfill() + + # Total gap length between surrounding valid values + total_gap = next_t - prev_t + allowed = ( + orig_na + & prev_t.notna() + & next_t.notna() + & (total_gap <= max_gap) + ) + + # Time-based interpolation only *inside* valid spans + s_interp = s.interpolate(method="time", limit_area="inside") + s_filled = s.copy() + s_filled[allowed] = s_interp[allowed] + + df[col] = s_filled + + time_col_out = resolve_time_column(df.columns) + # --- Prepare output: restore a string time column and drop helper ---------- + out = df.reset_index(drop=False) + # format time once + out_ts = out["__dt"].dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:23] + # write back into BOTH the chosen original time column and the canonical 'timestamp' + out[time_col_out] = out_ts + out["timestamp"] = out_ts + out = out.drop(columns=["__dt"]) + + # --- Write per-ID CSV 
------------------------------------------------------ + id_safe = re.sub(r'[\\/:*?"<>| ]+', "_", str(local_id)).strip("_") + out_path = output_path_template.replace(".csv", f"_{id_safe}__interp_inplace_le1d.csv") + out.to_csv(out_path, index=False) + created_paths.append(out_path) + + return created_paths + + +def normalize_column_names(df): + """ + Normalizes DataFrame column names: + - converts to lower-case + - replaces '-', ':', '.', spaces with '_' + - removes extra underscores at the beginning and end + """ + df = df.copy() + df.columns = [ + re.sub(r"[_]+", "_", re.sub(r"[-:.\s]+", "_", str(col).lower())).strip("_") + for col in df.columns + ] + return df + +def resolve_lon_lat_keys(fieldnames): + """ + Resolve actual longitude/latitude column names from a CSV header. + Returns (lon_key, lat_key) as *original* header strings. + Falls back to 'location-long' / 'location-lat' if present. + """ + import re + + def _norm(s: str) -> str: + return re.sub(r"[-:._\s]+", "_", str(s).lower()).strip("_") + + norm_map = {_norm(c): c for c in fieldnames} + + lon_syn = ["location_long", "location_lon", "location-long", "location-lon", + "longitude", "lon", "location_longitude", "location.longitude"] + lat_syn = ["location_lat", "location-lat", + "latitude", "lat", "location_latitude", "location.latitude"] + + lon_key = next((norm_map[_norm(c)] for c in lon_syn if _norm(c) in norm_map), None) + lat_key = next((norm_map[_norm(c)] for c in lat_syn if _norm(c) in norm_map), None) + + # fallback : dash-style names if present + if lon_key is None and "location-long" in fieldnames: + lon_key = "location-long" + if lat_key is None and "location-lat" in fieldnames: + lat_key = "location-lat" + + return lon_key, lat_key + + + +def _norm_key(s: str) -> str: + """Normalize a header key: lower-case and replace - : . 
space with _.""" + return re.sub(r"[-:._\s]+", "_", str(s).lower()).strip("_") + +def resolve_time_column(fieldnames) -> str: + """ + Pick the ORIGINAL header name that stores timestamps. + Preference order: + 1) TIME_COLUMN (normalized) + 2) 'timestamp' + 3) 'eobs:start-timestamp' / 'eobs_start_timestamp' + 4) 'time', 'datetime', 'date' + Returns: original header name if present; otherwise returns TIME_COLUMN. + """ + # map normalized -> original header + norm_to_orig = {_norm_key(c): c for c in fieldnames} + + candidates = [ + _norm_key(TIME_COLUMN), # whatever the module-level TIME_COLUMN is + "timestamp", + "eobs:start-timestamp", + "eobs_start_timestamp", + "time", "datetime", "date", + ] + for cand in candidates: + nk = _norm_key(cand) + if nk in norm_to_orig: + return norm_to_orig[nk] + return TIME_COLUMN # fallback + +def resolve_id_key(fieldnames) -> str | None: + """ + Return ORIGINAL header name that stores the individual ID. + Supports hyphens/underscores/colons/dots variants. + """ + norm_to_orig = {_norm_key(c): c for c in fieldnames} + candidates = [ + "individual_local_identifier", + "individual-local-identifier", + "individual:local-identifier", + "individual.local.identifier", + ] + for cand in candidates: + nk = _norm_key(cand) + if nk in norm_to_orig: + return norm_to_orig[nk] + return None + +def delete_files(paths: list[str], keep: list[str] | None = None) -> list[str]: + """ + Delete files by absolute/relative paths. + Returns a list of successfully deleted paths. 
+ """ + from pathlib import Path + keep_set = {str(Path(k).resolve()) for k in (keep or [])} + deleted = [] + for p in paths: + try: + rp = str(Path(p).resolve()) + if rp in keep_set: + continue + Path(rp).unlink(missing_ok=True) + deleted.append(rp) + except Exception as e: + print(f"[delete_files] Failed to delete {p}: {e}") + return deleted \ No newline at end of file diff --git a/ecodata/multidim_annotation_func.py b/ecodata/multidim_annotation_func.py new file mode 100644 index 0000000..a07d591 --- /dev/null +++ b/ecodata/multidim_annotation_func.py @@ -0,0 +1,2228 @@ +""" +Multidimensional annotation backend for ECODATA-Prepare. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, Union +import logging +import math +import re + +import numpy as np +import pandas as pd +import xarray as xr + +try: + import geopandas as gpd + from shapely.geometry import Point +except Exception: # pragma: no cover + gpd = None + Point = None + +try: + from ecodata.annotation_eng_func import ( + safe_open_nc_with_time_decoding, + get_nc_bounds, + load_vector_extent_info, + _k_nearest_indices as ae_k_nearest_indices, + _idw as ae_idw, + ) +except Exception: # pragma: no cover + safe_open_nc_with_time_decoding = None + get_nc_bounds = None + load_vector_extent_info = None + ae_k_nearest_indices = None + ae_idw = None + +LOGGER = logging.getLogger(__name__) +G0 = 9.80665 +_GEOID_MODEL = None + +TIME_CANDIDATES = ("time", "valid_time", "forecast_time", "verification_time", "datetime", "date") +LAT_CANDIDATES = ("lat", "latitude", "y") +LON_CANDIDATES = ("lon", "longitude", "long", "x") +LEVEL_CANDIDATES = ("level", "lev", "plev", "pressure", "pressure_level", "isobaricInhPa", "isobaric_in_hPa") + +VerticalMethod = Literal["nearest", "linear"] +HorizontalMethod = Literal["nearest", "idw"] +HeightReference = Literal["already_orthometric", 
"already_msl", "ellipsoidal", "agl"] +GeoidMode = Literal["none", "constant", "geographiclib", "pyproj_grid"] +VariableType = Literal["continuous", "categorical"] + + +@dataclass +class DatasetSpec: + path: Union[str, Path] + variables: List[str] + continuous: List[str] = field(default_factory=list) + categorical: List[str] = field(default_factory=list) + label_prefix: str = "" + + @classmethod + def from_single(cls, path: Union[str, Path], variable: Optional[str], label_prefix: str = "") -> Optional["DatasetSpec"]: + if not path or not variable: + return None + return cls(path=path, variables=[variable], continuous=[variable], categorical=[], label_prefix=label_prefix) + + +@dataclass +class OptionalComponentSpec: + path: Optional[Union[str, Path]] = None + variable: Optional[str] = None + label: str = "" + + def is_enabled(self) -> bool: + return bool(self.path and self.variable) + + +@dataclass +class MultidimAnnotationConfig: + movement_csv: Union[str, Path] + output_csv: Union[str, Path] + + id_col: str + time_col: str + lat_col: str + lon_col: str + height_col: str + + geopotential_file: Union[str, Path] + geopotential_variable: str + multilevel: DatasetSpec + + selected_ids: Optional[List[str]] = None + boundary_path: Optional[Union[str, Path]] = None + bbox: Optional[Dict[str, float]] = None + + coord_spec: Optional[Dict[str, Optional[str]]] = None + geopotential_units: Optional[str] = "m2 s-2" + convert_geopotential_to_height: bool = True + gravity_constant: float = G0 + + spatial_method: HorizontalMethod = "nearest" + smoothing_k: int = 1 + vertical_method: VerticalMethod = "nearest" + keep_diagnostics: bool = True + save_per_individual: bool = False + + height_reference: HeightReference = "ellipsoidal" + geoid_mode: GeoidMode = "geographiclib" + constant_geoid_undulation_m: float = 0.0 + geoid_grid_path: Optional[Union[str, Path]] = None + + surface: Optional[DatasetSpec] = None + use_surface_as_lower_anchor: bool = True + surface_height_agl_m: float = 
2.0 + + dem_file: Optional[Union[str, Path]] = None + dem_units: str = "m" + dem_reference: str = "orthometric" + + u_component: OptionalComponentSpec = field(default_factory=OptionalComponentSpec) + v_component: OptionalComponentSpec = field(default_factory=OptionalComponentSpec) + w_component: OptionalComponentSpec = field(default_factory=OptionalComponentSpec) + temperature_component: OptionalComponentSpec = field(default_factory=OptionalComponentSpec) + + derive_wind_speed_direction: bool = False + derive_wind_support_crosswind: bool = False + derive_vertical_motion: bool = False + derive_thermal_proxy: bool = False + derive_orographic_uplift: bool = False + + heading_col: Optional[str] = None + heading_source: Literal["compute", "column"] = "compute" + + allow_vertical_extrapolation: bool = False + + +def _as_path(path: Union[str, Path]) -> Path: + return Path(path).expanduser().resolve() + + +def _require_file(path: Union[str, Path], label: str) -> Path: + p = _as_path(path) + if not p.exists() or not p.is_file(): + raise FileNotFoundError(f"{label} not found or is not a file: {p}") + return p + + +def _normalise_name(name: str) -> str: + return re.sub(r"[-:.\s]+", "_", str(name).lower()) + + +def _unique(*values: Iterable[str]) -> List[str]: + out: List[str] = [] + seen = set() + for seq in values: + for val in list(seq or []): + if val not in seen: + seen.add(val) + out.append(val) + return out + + +def _normalize_vertical_method(value: str) -> VerticalMethod: + v = str(value or "").strip().lower() + return "linear" if ("linear" in v or "interpol" in v) else "nearest" + + +def _normalize_spatial_method(value: str) -> HorizontalMethod: + v = str(value or "").strip().lower() + if "idw" in v or "inverse" in v: + return "idw" + return "nearest" + + +def _safe_float(value: Any) -> float: + try: + return float(value) + except Exception: + return np.nan + + +def parse_movebank_timestamp_series(series: pd.Series, col_name: str = "timestamp") -> pd.Series: + raw = 
series.copy() + attempts: List[pd.Series] = [] + + for kwargs in ( + {"errors": "coerce", "utc": False}, + {"errors": "coerce", "utc": False, "dayfirst": True}, + {"errors": "coerce", "utc": False, "format": "mixed"}, + {"errors": "coerce", "utc": False, "format": "ISO8601"}, + ): + try: + attempts.append(pd.to_datetime(raw, **kwargs)) + except Exception: + pass + + out = attempts[0] if attempts else pd.to_datetime(raw, errors="coerce") + for parsed in attempts[1:]: + out = out.fillna(parsed) + + if out.isna().any(): + numeric = pd.to_numeric(raw, errors="coerce") + numeric_attempts = [] + for unit in ("s", "ms", "us", "ns"): + try: + numeric_attempts.append(pd.to_datetime(numeric, errors="coerce", unit=unit, utc=False)) + except Exception: + pass + if numeric_attempts: + best = max(numeric_attempts, key=lambda x: int(x.notna().sum())) + out = out.fillna(best) + + if out.isna().any(): + bad_mask = out.isna() + examples = raw[bad_mask].astype(str).head(10).tolist() + raise ValueError( + f"Timestamp column '{col_name}' contains {int(bad_mask.sum())} unparsable value(s). 
" + f"Examples: {examples}" + ) + + try: + if getattr(out.dt, "tz", None) is not None: + out = out.dt.tz_convert(None) + except Exception: + pass + return out + + +def _find_name(obj: Union[xr.Dataset, xr.DataArray], candidates: Sequence[str]) -> Optional[str]: + names: List[str] = [] + if isinstance(obj, xr.Dataset): + names.extend([str(x) for x in obj.coords]) + names.extend([str(x) for x in obj.dims]) + names.extend([str(x) for x in obj.variables]) + else: + names.extend([str(x) for x in obj.coords]) + names.extend([str(x) for x in obj.dims]) + lower = {n.lower(): n for n in names} + for cand in candidates: + if cand in names: + return cand + if cand.lower() in lower: + return lower[cand.lower()] + return None + + +def _coord_names(ds: xr.Dataset, coord_spec: Optional[Dict[str, Optional[str]]] = None, require_level: bool = False) -> Dict[str, Optional[str]]: + spec = coord_spec or {} + names = { + "time": spec.get("time") or _find_name(ds, TIME_CANDIDATES), + "lat": spec.get("lat") or _find_name(ds, LAT_CANDIDATES), + "lon": spec.get("lon") or _find_name(ds, LON_CANDIDATES), + "level": spec.get("level") or _find_name(ds, LEVEL_CANDIDATES), + } + for key, val in list(names.items()): + if val and val not in ds.variables and val not in ds.coords and val not in ds.dims: + names[key] = None + missing = [k for k in ("time", "lat", "lon") if names[k] is None] + if require_level and names["level"] is None: + missing.append("level") + if missing: + raise ValueError(f"Dataset is missing required coordinate(s): {', '.join(missing)}") + return names + + +def _rename_standard_coords(ds: xr.Dataset, coord_spec: Optional[Dict[str, Optional[str]]] = None) -> xr.Dataset: + names = _coord_names(ds, coord_spec, require_level=False) + mapping = {} + for std in ("time", "lat", "lon", "level"): + src = names.get(std) + if src and src != std and src in ds.variables: + mapping[src] = std + elif src and src != std and src in ds.dims: + mapping[src] = std + if mapping: + ds = 
ds.rename(mapping) + if "lat" in ds: + try: + vals = np.asarray(ds["lat"].values, dtype=float) + if vals.ndim == 1 and vals.size > 1 and vals[0] > vals[-1]: + ds = ds.sortby("lat") + except Exception: + pass + if "lon" in ds: + try: + vals = np.asarray(ds["lon"].values, dtype=float) + if vals.ndim == 1 and vals.size > 1 and vals[0] > vals[-1]: + ds = ds.sortby("lon") + except Exception: + pass + return ds + + +def open_dataset(path: Union[str, Path], coord_spec: Optional[Dict[str, Optional[str]]] = None) -> xr.Dataset: + p = _require_file(path, "NetCDF file") + if safe_open_nc_with_time_decoding is not None: + try: + ds = safe_open_nc_with_time_decoding(str(p)) + except Exception: + ds = xr.open_dataset(p, decode_times=True) + else: + try: + ds = xr.open_dataset(p, decode_times=True) + except Exception: + ds = xr.open_dataset(p, decode_times=False) + return _rename_standard_coords(ds, coord_spec) + + +def _wrap_lon(lon: float, lon_values: np.ndarray) -> float: + vals = np.asarray(lon_values, dtype=float) + finite = vals[np.isfinite(vals)] + if finite.size == 0: + return float(lon) + mn, mx = float(np.nanmin(finite)), float(np.nanmax(finite)) + if mn >= 0 and mx > 180 and lon < 0: + return float(lon) % 360.0 + if mn < 0 and mx <= 180 and lon > 180: + return ((float(lon) + 180.0) % 360.0) - 180.0 + return float(lon) + + +def _time_value(t: pd.Timestamp) -> Any: + return np.datetime64(pd.Timestamp(t).to_datetime64()) + + +def _nearest_time_index(values: np.ndarray, t: pd.Timestamp) -> int: + times = pd.to_datetime(values) + arr = times.to_numpy(dtype="datetime64[ns]").astype("int64") + target = np.datetime64(pd.Timestamp(t).to_datetime64()).astype("datetime64[ns]").astype("int64") + return int(np.nanargmin(np.abs(arr - target))) + + +def _select_time_space( + da: xr.DataArray, + t: pd.Timestamp, + lat: float, + lon: float, + *, + time_method: Literal["nearest", "linear"] = "linear", + spatial_method: Literal["nearest", "linear"] = "linear", + fixed_lat: 
Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> xr.DataArray: + lat_q = float(lat if fixed_lat is None else fixed_lat) + lon_q = float(lon if fixed_lon is None else fixed_lon) + lon_q = _wrap_lon(lon_q, np.asarray(da["lon"].values)) if "lon" in da.coords else lon_q + + out = da + + if spatial_method == "nearest": + isel_indexers: Dict[str, int] = {} + if "lat" in out.dims and "lat" in out.coords: + isel_indexers["lat"] = _nearest_index(np.asarray(out["lat"].values, dtype=float), lat_q) + if "lon" in out.dims and "lon" in out.coords: + isel_indexers["lon"] = _nearest_index(np.asarray(out["lon"].values, dtype=float), lon_q) + if isel_indexers: + out = out.isel(isel_indexers) + else: + spatial_indexers = {} + if "lat" in out.dims or "lat" in out.coords: + spatial_indexers["lat"] = lat_q + if "lon" in out.dims or "lon" in out.coords: + spatial_indexers["lon"] = lon_q + if spatial_indexers: + try: + out = out.interp(spatial_indexers, method="linear") + except Exception: + out = out.sel(spatial_indexers, method="nearest") + + if "time" in out.dims or "time" in out.coords: + if time_method == "nearest": + try: + if "time" in out.dims: + out = out.isel({"time": _nearest_time_index(out["time"].values, pd.Timestamp(t))}) + else: + out = out.sel({"time": _time_value(pd.Timestamp(t))}, method="nearest") + except Exception: + out = out.sel({"time": _time_value(pd.Timestamp(t))}, method="nearest") + else: + try: + out = out.interp({"time": _time_value(pd.Timestamp(t))}, method="linear") + except Exception: + out = out.sel({"time": _time_value(pd.Timestamp(t))}, method="nearest") + + return out.squeeze(drop=True) + + +def geopotential_to_height_m( + da: xr.DataArray, + units_override: Optional[str] = None, + convert_geopotential_to_height: bool = True, + gravity_constant: float = G0, +) -> xr.DataArray: + units = (units_override or da.attrs.get("units") or "").lower() + norm = units.replace("**", "^").replace("/", " ") + is_height = norm.strip() in {"m", 
"meter", "meters", "metre", "metres"} or "geopotential metre" in norm or "gpm" in norm + is_geopotential = any(x in norm for x in ("m^2 s^-2", "m2 s-2", "m2 s^-2", "m2 s**-2")) + if convert_geopotential_to_height and (is_geopotential or not is_height): + out = da / float(gravity_constant) + out.attrs["units"] = "m" + return out.rename("geopotential_height_m") + out = da.copy() + out.attrs["units"] = "m" + return out.rename("geopotential_height_m") + + +def sample_level_profile( + ds: xr.Dataset, + variable: str, + *, + t: pd.Timestamp, + lat: float, + lon: float, + time_method: Literal["nearest", "linear"] = "linear", + spatial_method: Literal["nearest", "linear"] = "linear", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Tuple[np.ndarray, np.ndarray]: + if variable not in ds.data_vars: + raise ValueError(f"Variable '{variable}' not found in dataset.") + if "level" not in ds[variable].dims and "level" not in ds[variable].coords: + raise ValueError(f"Variable '{variable}' has no level dimension.") + prof = _select_time_space( + ds[variable], t, lat, lon, + time_method=time_method, + spatial_method=spatial_method, + fixed_lat=fixed_lat, + fixed_lon=fixed_lon, + ) + if "level" not in prof.dims and "level" in prof.coords: + prof = prof.expand_dims({"level": prof["level"]}) + prof = prof.transpose("level", ...).squeeze(drop=True) + levels = np.asarray(prof["level"].values) + values = np.asarray(prof.values).astype(float).reshape(-1) + return levels, values + + +def sample_geopotential_profile( + ds: xr.Dataset, + variable: str, + *, + t: pd.Timestamp, + lat: float, + lon: float, + units_override: Optional[str], + convert_geopotential_to_height: bool, + gravity_constant: float, + time_method: Literal["nearest", "linear"] = "linear", + spatial_method: Literal["nearest", "linear"] = "linear", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Tuple[np.ndarray, np.ndarray]: + z = 
geopotential_to_height_m(ds[variable], units_override, convert_geopotential_to_height, gravity_constant) + tmp = z.to_dataset(name="geopotential_height_m") + return sample_level_profile( + tmp, "geopotential_height_m", t=t, lat=lat, lon=lon, + time_method=time_method, spatial_method=spatial_method, + fixed_lat=fixed_lat, fixed_lon=fixed_lon, + ) + + +def sample_surface_value( + ds: xr.Dataset, + variable: str, + *, + t: pd.Timestamp, + lat: float, + lon: float, + variable_type: VariableType = "continuous", + spatial_method: HorizontalMethod = "nearest", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Any: + if variable not in ds.data_vars: + raise ValueError(f"Variable '{variable}' not found in surface dataset.") + time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear" + spatial_xr: Literal["nearest", "linear"] = "nearest" if (variable_type == "categorical" or spatial_method == "nearest") else "linear" + da = _select_time_space( + ds[variable], t, lat, lon, + time_method=time_method, + spatial_method=spatial_xr, + fixed_lat=fixed_lat, + fixed_lon=fixed_lon, + ) + arr = np.asarray(da.values) + return arr.squeeze().item() if arr.size else np.nan + + +def _prepare_vertical_nodes(levels: Sequence[Any], heights: Sequence[float], values: Sequence[float], surface_value=None, surface_height=None) -> pd.DataFrame: + rows = [] + for lev, z, val in zip(levels, heights, values): + zf, vf = _safe_float(z), _safe_float(val) + if np.isfinite(zf) and np.isfinite(vf): + rows.append({"level": lev, "height_m": zf, "value": vf, "is_surface": False}) + if surface_value is not None and surface_height is not None: + sv, sh = _safe_float(surface_value), _safe_float(surface_height) + if np.isfinite(sv) and np.isfinite(sh): + rows.append({"level": "surface", "height_m": sh, "value": sv, "is_surface": True}) + if not rows: + return pd.DataFrame(columns=["level", "height_m", "value", "is_surface"]) + return 
pd.DataFrame(rows).sort_values("height_m", kind="mergesort").reset_index(drop=True) + + +def vertical_sample( + levels: Sequence[Any], + heights_m: Sequence[float], + values: Sequence[float], + target_height_m: float, + *, + method: VerticalMethod = "nearest", + variable_type: VariableType = "continuous", + surface_value: Optional[float] = None, + surface_height_m: Optional[float] = None, + allow_extrapolation: bool = False, +) -> Tuple[float, Dict[str, Any]]: + if variable_type == "categorical": + method = "nearest" + H = _safe_float(target_height_m) + nodes = _prepare_vertical_nodes(levels, heights_m, values, surface_value, surface_height_m) + diag: Dict[str, Any] = { + "vertical_method": method, + "target_height_msl_m": H, + "matched_level": np.nan, + "matched_level_height_m": np.nan, + "height_difference_m": np.nan, + "lower_level": np.nan, + "upper_level": np.nan, + "lower_height_m": np.nan, + "upper_height_m": np.nan, + "vertical_weight_upper": np.nan, + "surface_anchor_used": False, + "vertical_out_of_range": False, + "vertical_warning": "", + } + if not np.isfinite(H): + diag["vertical_warning"] = "invalid_target_height" + return np.nan, diag + if nodes.empty: + diag["vertical_warning"] = "empty_vertical_profile" + return np.nan, diag + z = nodes["height_m"].to_numpy(dtype=float) + v = nodes["value"].to_numpy(dtype=float) + if method == "nearest" or len(nodes) == 1: + idx = int(np.nanargmin(np.abs(z - H))) + diag.update({ + "matched_level": nodes.loc[idx, "level"], + "matched_level_height_m": float(z[idx]), + "height_difference_m": float(H - z[idx]), + "surface_anchor_used": bool(nodes.loc[idx, "is_surface"]), + }) + return float(v[idx]), diag + if H < z[0]: + diag["vertical_out_of_range"] = True + if not allow_extrapolation: + diag["vertical_warning"] = "below_lowest_vertical_node" + return np.nan, diag + lo, hi = 0, min(1, len(z) - 1) + elif H > z[-1]: + diag["vertical_out_of_range"] = True + if not allow_extrapolation: + diag["vertical_warning"] = 
"above_highest_vertical_node" + return np.nan, diag + lo, hi = max(0, len(z) - 2), len(z) - 1 + else: + hi = int(np.searchsorted(z, H, side="left")) + if hi == 0: + lo = hi = 0 + elif hi < len(z) and np.isclose(z[hi], H): + lo = hi + else: + lo, hi = hi - 1, min(hi, len(z) - 1) + if lo == hi or np.isclose(z[lo], z[hi]): + val, w = float(v[lo]), 0.0 + else: + w = float((H - z[lo]) / (z[hi] - z[lo])) + val = float(v[lo] * (1 - w) + v[hi] * w) + nearest = lo if abs(H - z[lo]) <= abs(H - z[hi]) else hi + diag.update({ + "lower_level": nodes.loc[lo, "level"], + "upper_level": nodes.loc[hi, "level"], + "lower_height_m": float(z[lo]), + "upper_height_m": float(z[hi]), + "vertical_weight_upper": float(w), + "matched_level": nodes.loc[nearest, "level"], + "matched_level_height_m": float(z[nearest]), + "height_difference_m": float(H - z[nearest]), + "surface_anchor_used": bool(nodes.loc[lo, "is_surface"] or nodes.loc[hi, "is_surface"]), + }) + return val, diag + + +def sample_dem_elevation(dem_file: Optional[Union[str, Path]], lat: float, lon: float) -> Tuple[float, str]: + if not dem_file: + return np.nan, "dem_not_provided" + try: + import rasterio + from pyproj import Transformer + except Exception: + return np.nan, "rasterio_or_pyproj_not_available" + try: + path = _require_file(dem_file, "DEM file") + with rasterio.open(path) as src: + x, y = float(lon), float(lat) + if src.crs is not None and str(src.crs).upper() not in {"EPSG:4326", "OGC:CRS84"}: + x, y = Transformer.from_crs("EPSG:4326", src.crs, always_xy=True).transform(x, y) + row, col = src.index(x, y) + if row < 0 or col < 0 or row >= src.height or col >= src.width: + return np.nan, "point_outside_dem" + arr = src.read(1, window=((row, row + 1), (col, col + 1)), masked=True) + if np.ma.is_masked(arr) and bool(np.ma.getmaskarray(arr).squeeze()): + return np.nan, "dem_nodata" + val = float(np.asarray(arr).squeeze()) + if src.nodata is not None and np.isclose(val, float(src.nodata), equal_nan=True): + return 
np.nan, "dem_nodata" + return val, "" + except Exception as exc: + return np.nan, f"dem_sampling_failed:{exc}" + +def compute_dem_slope_aspect( + dem_file: Union[str, Path], + lat: float, + lon: float, + sample_radius_px: int = 1, +) -> Tuple[float, float]: + """ + Compute terrain slope and aspect at a given point from a DEM raster. + + Uses central-difference finite differences on the surrounding pixel + neighbourhood to estimate first-order partial derivatives of elevation. + + Args: + dem_file: Path to the DEM raster (GeoTIFF or similar). + lat, lon: Geographic coordinates of the query point (degrees). + sample_radius_px: Half-size of the pixel window used for finite + differences (default 1 = 3x3 window). + + Returns: + (slope_rad, aspect_rad) where + slope_rad -- terrain slope angle from horizontal (radians, 0..π/2). + aspect_rad -- upslope direction measured clockwise from North (radians, + 0..2π), i.e. the direction the slope faces. + Both values are NaN on error or where the DEM has no data. 
+ """ + try: + import rasterio + from pyproj import Transformer + except Exception: + return np.nan, np.nan + + try: + path = _require_file(dem_file, "DEM file") + with rasterio.open(path) as src: + x, y = float(lon), float(lat) + if src.crs is not None and str(src.crs).upper() not in {"EPSG:4326", "OGC:CRS84"}: + x, y = Transformer.from_crs( + "EPSG:4326", src.crs, always_xy=True + ).transform(x, y) + + row_c, col_c = src.index(x, y) + r0 = max(0, row_c - sample_radius_px) + r1 = min(src.height, row_c + sample_radius_px + 1) + c0 = max(0, col_c - sample_radius_px) + c1 = min(src.width, col_c + sample_radius_px + 1) + if r1 - r0 < 2 or c1 - c0 < 2: + return np.nan, np.nan + + patch = src.read(1, window=((r0, r1), (c0, c1)), masked=True).astype(float) + if src.nodata is not None: + patch[patch == float(src.nodata)] = np.nan + + # Pixel size in metres + res_x = abs(src.transform.a) + res_y = abs(src.transform.e) + if src.crs is not None and src.crs.is_geographic: + # Convert arc-degrees to metres at this latitude + lat_rad = math.radians(float(lat)) + res_x = res_x * math.pi / 180.0 * 6_371_000.0 * math.cos(lat_rad) + res_y = res_y * math.pi / 180.0 * 6_371_000.0 + + rows, cols = patch.shape + cr, cc = rows // 2, cols // 2 + + # East-west gradient (positive = elevation increases eastward) + if 0 < cc < cols - 1: + dz_dx = (patch[cr, cc + 1] - patch[cr, cc - 1]) / (2.0 * res_x) + elif cc < cols - 1: + dz_dx = (patch[cr, cc + 1] - patch[cr, cc]) / res_x + else: + dz_dx = (patch[cr, cc] - patch[cr, cc - 1]) / res_x + + # North-south gradient (positive = elevation increases northward). + # Rasterio row index increases southward, so north = smaller row index. 
+ if 0 < cr < rows - 1: + dz_dy = (patch[cr - 1, cc] - patch[cr + 1, cc]) / (2.0 * res_y) + elif cr < rows - 1: + dz_dy = (patch[cr, cc] - patch[cr + 1, cc]) / res_y + else: + dz_dy = (patch[cr - 1, cc] - patch[cr, cc]) / res_y + + if not (np.isfinite(dz_dx) and np.isfinite(dz_dy)): + return np.nan, np.nan + + slope = math.atan(math.sqrt(dz_dx**2 + dz_dy**2)) + # Aspect: direction the slope faces, clockwise from North. + # atan2(dz_dx, dz_dy) maps (east gradient, north gradient) + # to the bearing of the upslope direction. + aspect = (math.atan2(dz_dx, dz_dy) + 2.0 * math.pi) % (2.0 * math.pi) + return float(slope), float(aspect) + + except Exception: + return np.nan, np.nan + +def compute_orographic_uplift( + u10_ms: float, + v10_ms: float, + slope_rad: float, + aspect_rad: float, +) -> float: + """ + Orographic updraft velocity Wo (m/s) following Bohrer et al. (2012). + + Wo = V_surface * sin(slope) * cos(wind_from - aspect) + + where wind_from is the direction the wind is blowing *from* + (meteorological convention: easterly wind -> 90°). + + Positive Wo = wind blowing onto the upslope face -> updraft. + Negative Wo = wind blowing off the slope (lee side) -> downdraft. + + Args: + u10_ms: ERA5 10-metre U-component of wind (m/s, eastward positive). + v10_ms: ERA5 10-metre V-component of wind (m/s, northward positive). + slope_rad: Terrain slope angle in radians, from compute_dem_slope_aspect(). + aspect_rad: Upslope-facing direction clockwise from North (radians). + + Returns: + Wo in m/s, or NaN if any input is missing. + """ + u, v = float(u10_ms), float(v10_ms) + slope = float(slope_rad) + aspect = float(aspect_rad) + + if not all(np.isfinite([u, v, slope, aspect])): + return np.nan + + V = math.sqrt(u * u + v * v) + if V < 1e-6: + return 0.0 + + # Direction the wind is blowing FROM, clockwise from North (radians). + # atan2(u, v): u=east component, v=north component gives bearing from North. 
+ wind_from = (math.atan2(u, v) + 2.0 * math.pi) % (2.0 * math.pi) + + return float(V * math.sin(slope) * math.cos(wind_from - aspect)) + +def geoid_undulation_geographiclib(lat: float, lon: float) -> Optional[float]: + global _GEOID_MODEL + try: + from geographiclib.geoid import Geoid + if _GEOID_MODEL is None: + _GEOID_MODEL = Geoid("egm2008") + llon = ((float(lon) + 180.0) % 360.0) - 180.0 + return float(_GEOID_MODEL.Height(float(lat), llon)) + except Exception: + return None + + +def geoid_undulation_pyproj(lat: float, lon: float, grid_path: Optional[Union[str, Path]]) -> Optional[float]: + if not grid_path: + return None + try: + from pyproj import CRS, Transformer + crs_geog_3d = CRS.from_epsg(4979) + pipeline = f"+proj=pipeline +step +proj=vgridshift +grids={_as_path(grid_path)} +multiplier=1" + transformer = Transformer.from_crs(crs_geog_3d, CRS.from_pipeline(pipeline), always_xy=True) + h0 = 100.0 + H = transformer.transform(float(lon), float(lat), h0)[2] + return float(h0 - H) + except Exception: + return None + + +def compute_orthometric_height( + raw_height_m: float, + lat: float, + lon: float, + *, + height_reference: HeightReference, + geoid_mode: GeoidMode, + constant_geoid_undulation_m: float, + geoid_grid_path: Optional[Union[str, Path]] = None, + terrain_elevation_m: Optional[float] = None, +) -> Tuple[float, Dict[str, Any]]: + h = _safe_float(raw_height_m) + diag: Dict[str, Any] = { + "height_input_m": h, + "height_reference": height_reference, + "geoid_mode": geoid_mode, + "geoid_undulation_m": np.nan, + "height_conversion_warning": "", + } + if not np.isfinite(h): + diag["height_conversion_warning"] = "invalid_height" + return np.nan, diag + if height_reference in ("already_orthometric", "already_msl"): + return h, diag + if height_reference == "agl": + terrain = np.nan if terrain_elevation_m is None else float(terrain_elevation_m) + if not np.isfinite(terrain): + diag["height_conversion_warning"] = "agl_height_without_valid_dem" + return 
np.nan, diag + return terrain + h, diag + N: Optional[float] = None + if geoid_mode == "geographiclib": + N = geoid_undulation_geographiclib(lat, lon) + elif geoid_mode == "pyproj_grid": + N = geoid_undulation_pyproj(lat, lon, geoid_grid_path) + elif geoid_mode == "constant": + N = float(constant_geoid_undulation_m) + elif geoid_mode == "none": + N = 0.0 + if N is None: + N = float(constant_geoid_undulation_m) + diag["height_conversion_warning"] = "geoid_lookup_failed_used_constant_N" + diag["geoid_undulation_m"] = float(N) + return h - float(N), diag + + +def _nearest_index(arr: np.ndarray, x: float) -> int: + arr = np.asarray(arr, dtype=float) + idx = int(np.searchsorted(arr, x)) + if idx <= 0: + return 0 + if idx >= len(arr): + return len(arr) - 1 + return idx if abs(arr[idx] - x) < abs(arr[idx - 1] - x) else idx - 1 + + +def _k_nearest_indices(glat: np.ndarray, glon: np.ndarray, lat: float, lon: float, k: int) -> List[Tuple[int, int]]: + if ae_k_nearest_indices is not None: + try: + return list(ae_k_nearest_indices(glat, glon, lat, lon, k)) + except Exception: + pass + i0 = _nearest_index(glat, lat) + j0 = _nearest_index(glon, lon) + r = int(np.ceil(max(1, np.sqrt(k)))) + candidates = [] + for ii in range(max(0, i0 - r), min(len(glat) - 1, i0 + r) + 1): + for jj in range(max(0, j0 - r), min(len(glon) - 1, j0 + r) + 1): + d = float(np.hypot(glat[ii] - lat, glon[jj] - lon)) + candidates.append((d, ii, jj)) + candidates.sort(key=lambda x: x[0]) + return [(ii, jj) for _, ii, jj in candidates[:k]] + + +def _idw(values: Sequence[float], distances: Sequence[float], p: float = 2.0) -> float: + if ae_idw is not None: + try: + return float(ae_idw(values, distances, p=p)) + except Exception: + pass + vals = np.asarray(values, dtype=float) + d = np.asarray(distances, dtype=float) + 1e-12 + mask = np.isfinite(vals) + if not mask.any(): + return np.nan + w = 1.0 / (d[mask] ** p) + return float(np.sum(vals[mask] * w) / np.sum(w)) + + +def _filter_boundary(df: pd.DataFrame, 
lat_col: str, lon_col: str, boundary_path: Optional[Union[str, Path]], bbox: Optional[Dict[str, float]]) -> pd.DataFrame: + out = df.copy() + if bbox: + S = float(bbox.get("S", bbox.get("south"))) + N = float(bbox.get("N", bbox.get("north"))) + W = float(bbox.get("W", bbox.get("west"))) + E = float(bbox.get("E", bbox.get("east"))) + return out[out[lat_col].between(S, N) & out[lon_col].between(W, E)].copy() + if not boundary_path: + return out + if gpd is None or Point is None: + raise RuntimeError("geopandas/shapely are required for boundary filtering.") + boundary = gpd.read_file(_require_file(boundary_path, "Boundary file")) + points = gpd.GeoDataFrame(out, geometry=[Point(xy) for xy in zip(out[lon_col], out[lat_col])], crs="EPSG:4326") + if boundary.crs != points.crs: + boundary = boundary.to_crs(points.crs) + clipped = gpd.sjoin(points, boundary[["geometry"]], predicate="within", how="inner").drop(columns=["index_right", "geometry"], errors="ignore") + return pd.DataFrame(clipped) + + +def _dataset_bounds(ds: xr.Dataset) -> Optional[Dict[str, float]]: + try: + return {"S": float(ds["lat"].min()), "N": float(ds["lat"].max()), "W": float(ds["lon"].min()), "E": float(ds["lon"].max())} + except Exception: + return None + + +def _load_movement(config: MultidimAnnotationConfig, ds_for_bbox: Optional[xr.Dataset] = None) -> pd.DataFrame: + path = _require_file(config.movement_csv, "Movement CSV") + df = pd.read_csv(path) + required = [config.id_col, config.time_col, config.lat_col, config.lon_col, config.height_col] + missing = [c for c in required if c not in df.columns] + if missing: + raise ValueError(f"Movement CSV is missing required column(s): {', '.join(missing)}") + if config.selected_ids: + ids = {str(x) for x in config.selected_ids} + df = df[df[config.id_col].astype(str).isin(ids)].copy() + df[config.time_col] = parse_movebank_timestamp_series(df[config.time_col], config.time_col) + df[config.lat_col] = pd.to_numeric(df[config.lat_col], errors="coerce") + 
df[config.lon_col] = pd.to_numeric(df[config.lon_col], errors="coerce") + df[config.height_col] = pd.to_numeric(df[config.height_col], errors="coerce") + df = df.dropna(subset=[config.time_col, config.lat_col, config.lon_col, config.height_col]).copy() + bbox = config.bbox or (_dataset_bounds(ds_for_bbox) if ds_for_bbox is not None and not config.boundary_path else None) + df = _filter_boundary(df, config.lat_col, config.lon_col, config.boundary_path, bbox) + return df.reset_index(drop=True) + + +def _time_range(ds: xr.Dataset) -> Tuple[pd.Timestamp, pd.Timestamp]: + vals = pd.to_datetime(ds["time"].values) + return pd.Timestamp(vals.min()), pd.Timestamp(vals.max()) + + +def _prefilter_time(df: pd.DataFrame, config: MultidimAnnotationConfig, required_datasets: Sequence[xr.Dataset]) -> pd.DataFrame: + starts, ends = [], [] + for ds in required_datasets: + if ds is None or "time" not in ds: + continue + try: + start, end = _time_range(ds) + starts.append(start) + ends.append(end) + except Exception: + pass + if not starts or not ends: + return df + start, end = max(starts), min(ends) + if start > end: + raise ValueError(f"Input NetCDF files have no overlapping time range: latest start={start}, earliest end={end}") + return df[df[config.time_col].between(start, end)].copy() + + +def _surface_height(config: MultidimAnnotationConfig, terrain: float) -> float: + if np.isfinite(terrain): + return float(terrain) + float(config.surface_height_agl_m) + return float(config.surface_height_agl_m) + + +def _slice_coord(ds: xr.Dataset, coord: str, low: float, high: float) -> xr.Dataset: + if coord not in ds.coords and coord not in ds.variables: + return ds + vals = np.asarray(ds[coord].values, dtype=float) + if vals.size < 2: + return ds + lo, hi = float(min(low, high)), float(max(low, high)) + try: + if vals[0] <= vals[-1]: + return ds.sel({coord: slice(lo, hi)}) + return ds.sel({coord: slice(hi, lo)}) + except Exception: + return ds + +def _slice_time_with_bracket(ds: 
xr.Dataset, tmin: pd.Timestamp, tmax: pd.Timestamp) -> xr.Dataset: + """ + Subset dataset by movement time range, but keep one neighbouring NetCDF + timestep before and after the movement range when possible. + + This is important for linear time interpolation: if movement timestamps fall + between two NetCDF timesteps, a strict slice(tmin, tmax) may remove the + required bracketing timesteps. + """ + if "time" not in ds.coords and "time" not in ds.variables: + return ds + + try: + times = pd.to_datetime(ds["time"].values) + if len(times) == 0: + return ds + + arr = times.to_numpy(dtype="datetime64[ns]") + start = np.datetime64(pd.Timestamp(tmin).to_datetime64()).astype("datetime64[ns]") + end = np.datetime64(pd.Timestamp(tmax).to_datetime64()).astype("datetime64[ns]") + + # Assumes time is sorted ascending, which should normally be true after open_dataset(). + left = int(np.searchsorted(arr, start, side="left")) + right = int(np.searchsorted(arr, end, side="right")) - 1 + + left = max(0, left - 1) + right = min(len(arr) - 1, right + 1) + + if right < left: + return ds + + return ds.isel({"time": slice(left, right + 1)}) + except Exception: + try: + return ds.sel({"time": slice(tmin, tmax)}) + except Exception: + return ds + +def subset_dataset_to_movement(ds: Optional[xr.Dataset], movement: pd.DataFrame, config: MultidimAnnotationConfig, buffer_deg: float = 1.0) -> Optional[xr.Dataset]: + if ds is None or movement.empty: + return ds + out = ds + try: + if "time" in out.coords and config.time_col in movement.columns: + tmin = pd.Timestamp(movement[config.time_col].min()) + tmax = pd.Timestamp(movement[config.time_col].max()) + out = _slice_time_with_bracket(out, tmin, tmax) + except Exception: + pass + try: + if "lat" in out.coords and config.lat_col in movement.columns: + lat_min = float(movement[config.lat_col].min()) - buffer_deg + lat_max = float(movement[config.lat_col].max()) + buffer_deg + out = _slice_coord(out, "lat", lat_min, lat_max) + except Exception: 
+ pass + try: + if "lon" in out.coords and config.lon_col in movement.columns: + lon_vals = np.asarray(out["lon"].values, dtype=float) + lon_series = movement[config.lon_col].astype(float).map(lambda x: _wrap_lon(float(x), lon_vals)) + lon_min = float(lon_series.min()) - buffer_deg + lon_max = float(lon_series.max()) + buffer_deg + if lon_max - lon_min < 350: + out = _slice_coord(out, "lon", lon_min, lon_max) + except Exception: + pass + return out + + +def _geopotential_cache_key(t: pd.Timestamp, lat: float, lon: float, time_method: str, fixed_lat: Optional[float], fixed_lon: Optional[float]) -> Tuple[Any, ...]: + lat_key = round(float(lat if fixed_lat is None else fixed_lat), 6) + lon_key = round(float(lon if fixed_lon is None else fixed_lon), 6) + t_key = pd.Timestamp(t).to_datetime64() + return (t_key, lat_key, lon_key, str(time_method)) + + +def _get_geopotential_profile_cached( + ds_geo: xr.Dataset, + geo_name: str, + row: pd.Series, + config: MultidimAnnotationConfig, + variable_type: VariableType, + cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None, + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Tuple[np.ndarray, np.ndarray]: + t = pd.Timestamp(row[config.time_col]) + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear" + key = _geopotential_cache_key(t, lat, lon, time_method, fixed_lat, fixed_lon) + if cache is not None and key in cache: + return cache[key] + levels_geo, heights = sample_geopotential_profile( + ds_geo, + geo_name, + t=t, + lat=lat, + lon=lon, + units_override=config.geopotential_units, + convert_geopotential_to_height=config.convert_geopotential_to_height, + gravity_constant=config.gravity_constant, + time_method=time_method, + spatial_method="nearest", + fixed_lat=fixed_lat, + fixed_lon=fixed_lon, + ) + if cache is not None: + cache[key] = (levels_geo, heights) + 
def _sample_var_at_cell(
    ds_var: xr.Dataset,
    var_name: str,
    ds_geo: xr.Dataset,
    geo_name: str,
    row: pd.Series,
    config: MultidimAnnotationConfig,
    target_height: float,
    terrain: float,
    variable_type: VariableType,
    surface_value: Optional[float] = None,
    fixed_lat: Optional[float] = None,
    fixed_lon: Optional[float] = None,
    geo_cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None,
) -> Tuple[float, Dict[str, Any]]:
    """Sample one multilevel variable at ``target_height`` for a single grid cell.

    The geopotential profile (optionally memoised in ``geo_cache``) and the
    variable's level profile are both sampled with nearest-neighbour spatial
    selection, the geopotential heights are aligned to the variable's levels,
    and the result is handed to ``vertical_sample``.

    Args:
        ds_var / var_name: Dataset and variable to sample.
        ds_geo / geo_name: Dataset and variable holding the geopotential.
        row: Movement record; time, latitude and longitude are read from the
            columns named in ``config``.
        config: Annotation configuration (vertical method, anchoring, ...).
        target_height: Height at which the variable is evaluated.
        terrain: Terrain elevation, used only for the surface anchor height.
        variable_type: ``"categorical"`` selects nearest-in-time sampling,
            anything else linear-in-time sampling.
        surface_value: Optional lower anchor value; only used for continuous
            variables when ``config.use_surface_as_lower_anchor`` is set.
        fixed_lat / fixed_lon: Optional grid-cell coordinates forwarded to the
            profile samplers.
        geo_cache: Optional cache shared across calls.

    Returns:
        Whatever ``vertical_sample`` returns: ``(value, diagnostics)``.
    """
    t = pd.Timestamp(row[config.time_col])
    lat = float(row[config.lat_col])
    lon = float(row[config.lon_col])
    time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear"
    spatial_xr: Literal["nearest", "linear"] = "nearest"

    levels_geo, heights = _get_geopotential_profile_cached(
        ds_geo,
        geo_name,
        row,
        config,
        variable_type,
        geo_cache,
        fixed_lat=fixed_lat,
        fixed_lon=fixed_lon,
    )
    levels_var, values = sample_level_profile(
        ds_var,
        var_name,
        t=t,
        lat=lat,
        lon=lon,
        time_method=time_method,
        spatial_method=spatial_xr,
        fixed_lat=fixed_lat,
        fixed_lon=fixed_lon,
    )

    # Use the shared level-matching helper rather than a private copy of the
    # same logic, so this path and the fast path align levels identically.
    heights_for_values = _fast_heights_for_variable_levels(levels_geo, heights, levels_var)

    # The surface anchor applies to continuous variables only, and only when
    # the configuration enables it.
    sv = surface_value if (config.use_surface_as_lower_anchor and variable_type == "continuous") else None
    sh = _surface_height(config, terrain) if sv is not None else None
    return vertical_sample(
        levels_var,
        heights_for_values,
        values,
        target_height,
        method=config.vertical_method,
        variable_type=variable_type,
        surface_value=sv,
        surface_height_m=sh,
        allow_extrapolation=config.allow_vertical_extrapolation,
    )
xr.Dataset, + geo_name: str, + row: pd.Series, + config: MultidimAnnotationConfig, + target_height: float, + terrain: float, + variable_type: VariableType, + surface_value: Optional[float] = None, + geo_cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None, +) -> Tuple[float, Dict[str, Any]]: + if variable_type == "categorical" or config.spatial_method == "nearest": + return _sample_var_at_cell( + ds_var, + var_name, + ds_geo, + geo_name, + row, + config, + target_height, + terrain, + variable_type, + surface_value, + geo_cache=geo_cache, + ) + + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + glat = np.asarray(ds_var["lat"].values, dtype=float) + glon = np.asarray(ds_var["lon"].values, dtype=float) + lon_adj = _wrap_lon(lon, glon) + k = max(2, int(config.smoothing_k)) + samples, dists, last_diag = [], [], {} + for ii, jj in _k_nearest_indices(glat, glon, lat, lon_adj, k): + flat, flon = float(glat[ii]), float(glon[jj]) + val, diag = _sample_var_at_cell( + ds_var, + var_name, + ds_geo, + geo_name, + row, + config, + target_height, + terrain, + variable_type, + surface_value, + fixed_lat=flat, + fixed_lon=flon, + geo_cache=geo_cache, + ) + samples.append(val) + dists.append(float(np.hypot(flat - lat, flon - lon_adj))) + last_diag = diag + return _idw(samples, dists), last_diag + + +def _bearing_deg(lat1: float, lon1: float, lat2: float, lon2: float) -> float: + if not all(np.isfinite([lat1, lon1, lat2, lon2])): + return np.nan + phi1, phi2 = math.radians(lat1), math.radians(lat2) + dlon = math.radians(lon2 - lon1) + x = math.sin(dlon) * math.cos(phi2) + y = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(dlon) + return (math.degrees(math.atan2(x, y)) + 360.0) % 360.0 + + +def add_track_bearing(df: pd.DataFrame, id_col: str, time_col: str, lat_col: str, lon_col: str, heading_col: Optional[str], heading_source: str) -> pd.DataFrame: + out = df.copy() + if heading_source == "column" and 
def add_wind_metrics(df: pd.DataFrame, u_col: str = "td_u_at_height", v_col: str = "td_v_at_height") -> pd.DataFrame:
    """Return a copy of ``df`` with wind speed, direction and track-relative columns.

    Added columns:
        * ``wind_speed_ms`` - magnitude of the (u, v) vector.
        * ``wind_to_direction_deg`` - direction the wind blows towards,
          clockwise from north in ``[0, 360)``.
        * ``wind_from_direction_deg`` - the opposite direction.
        * ``wind_support_ms`` / ``crosswind_ms`` - wind components along and
          across ``track_bearing_deg``; only added when that column exists.

    A missing ``u_col`` or ``v_col`` is treated as an all-NaN series, and
    non-numeric entries are coerced to NaN. ``df`` itself is not modified.
    """
    result = df.copy()

    def _numeric_or_nan(column: str) -> pd.Series:
        # Absent columns degrade to NaN instead of raising.
        if column in result.columns:
            return pd.to_numeric(result[column], errors="coerce")
        return pd.Series(np.nan, index=result.index)

    u = _numeric_or_nan(u_col)
    v = _numeric_or_nan(v_col)

    result["wind_speed_ms"] = np.sqrt(u * u + v * v)
    toward = (np.degrees(np.arctan2(u, v)) + 360.0) % 360.0
    result["wind_to_direction_deg"] = toward
    result["wind_from_direction_deg"] = (toward + 180.0) % 360.0

    if "track_bearing_deg" not in result.columns:
        return result

    theta = np.radians(pd.to_numeric(result["track_bearing_deg"], errors="coerce"))
    sin_theta = np.sin(theta)
    cos_theta = np.cos(theta)
    result["wind_support_ms"] = u * sin_theta + v * cos_theta
    result["crosswind_ms"] = u * cos_theta - v * sin_theta
    return result
def _fast_open_4d(ds: xr.Dataset, variable: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load a multilevel variable into memory for fast numpy sampling.

    The variable is validated by ``_fast_required_dims``, transposed to
    ``(time, level, lat, lon)`` and loaded.

    Returns:
        ``(values, times, levels, lat, lon)`` where ``values`` is a float
        array in ``(time, level, lat, lon)`` order, ``times`` is a
        ``datetime64[ns]`` array, ``levels`` keeps the coordinate's own dtype
        and ``lat`` / ``lon`` are float arrays.

    Raises:
        ValueError: if ``variable`` is not a data variable of ``ds`` (or if
            ``_fast_required_dims`` rejects its dimensions).
    """
    if variable not in ds.data_vars:
        raise ValueError(f"Variable '{variable}' not found in dataset.")

    dim_order = ("time", "level", "lat", "lon")
    data = _fast_required_dims(ds[variable], dim_order, variable).transpose(*dim_order)

    values = np.asarray(data.load().values, dtype=float)
    time_axis = pd.to_datetime(data["time"].values).to_numpy(dtype="datetime64[ns]")
    level_axis = np.asarray(data["level"].values)
    lat_axis = np.asarray(data["lat"].values, dtype=float)
    lon_axis = np.asarray(data["lon"].values, dtype=float)

    return values, time_axis, level_axis, lat_axis, lon_axis
pd.to_datetime(da["time"].values).to_numpy(dtype="datetime64[ns]") + lat = np.asarray(da["lat"].values, dtype=float) + lon = np.asarray(da["lon"].values, dtype=float) + + return arr, times, lat, lon + + +def _fast_time_index_weight( + times: np.ndarray, + target: pd.Timestamp, + *, + method: Literal["nearest", "linear"], +) -> Tuple[int, int, float]: + """ + Return t0, t1, weight for fast temporal sampling. + + For nearest: + value = arr[t0] + For linear: + value = arr[t0] * (1 - w) + arr[t1] * w + """ + if len(times) == 0: + raise ValueError("Cannot sample dataset with empty time coordinate.") + + arr = np.asarray(times).astype("datetime64[ns]") + target64 = np.datetime64(pd.Timestamp(target).to_datetime64()).astype("datetime64[ns]") + + if method == "nearest" or len(arr) == 1: + diffs = np.abs(arr.astype("int64") - target64.astype("int64")) + idx = int(np.nanargmin(diffs)) + return idx, idx, 0.0 + + right = int(np.searchsorted(arr, target64, side="left")) + + if right <= 0: + return 0, 0, 0.0 + if right >= len(arr): + last = len(arr) - 1 + return last, last, 0.0 + if arr[right] == target64: + return right, right, 0.0 + + left = right - 1 + t0 = arr[left].astype("int64") + t1 = arr[right].astype("int64") + tt = target64.astype("int64") + + if t1 == t0: + return left, right, 0.0 + + w = float((tt - t0) / (t1 - t0)) + return left, right, w + + +def _fast_nearest_lat_lon_indices( + lat_values: np.ndarray, + lon_values: np.ndarray, + lat: float, + lon: float, +) -> Tuple[int, int]: + lon_adj = _wrap_lon(float(lon), lon_values) + yi = _nearest_index(np.asarray(lat_values, dtype=float), float(lat)) + xi = _nearest_index(np.asarray(lon_values, dtype=float), lon_adj) + return int(yi), int(xi) + + +def _fast_sample_4d_profile( + arr: np.ndarray, + times: np.ndarray, + lat_values: np.ndarray, + lon_values: np.ndarray, + *, + t: pd.Timestamp, + lat: float, + lon: float, + variable_type: VariableType, +) -> np.ndarray: + time_method: Literal["nearest", "linear"] = "nearest" 
if variable_type == "categorical" else "linear" + t0, t1, w = _fast_time_index_weight(times, t, method=time_method) + yi, xi = _fast_nearest_lat_lon_indices(lat_values, lon_values, lat, lon) + + v0 = arr[t0, :, yi, xi] + if t1 == t0 or w == 0.0: + return np.asarray(v0, dtype=float) + + v1 = arr[t1, :, yi, xi] + return np.asarray(v0 * (1.0 - w) + v1 * w, dtype=float) + + +def _fast_sample_3d_value( + arr: np.ndarray, + times: np.ndarray, + lat_values: np.ndarray, + lon_values: np.ndarray, + *, + t: pd.Timestamp, + lat: float, + lon: float, + variable_type: VariableType, +) -> float: + time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear" + t0, t1, w = _fast_time_index_weight(times, t, method=time_method) + yi, xi = _fast_nearest_lat_lon_indices(lat_values, lon_values, lat, lon) + + v0 = float(arr[t0, yi, xi]) + if t1 == t0 or w == 0.0: + return v0 + + v1 = float(arr[t1, yi, xi]) + return float(v0 * (1.0 - w) + v1 * w) + + +def _fast_heights_for_variable_levels( + geo_levels: np.ndarray, + geo_heights: np.ndarray, + var_levels: np.ndarray, +) -> np.ndarray: + """ + Match geopotential-derived heights to variable levels. + + If levels are identical and in the same order, return heights directly. + Otherwise match by string representation of the level coordinate. 
+ """ + if len(geo_levels) == len(var_levels) and np.array_equal(np.asarray(geo_levels), np.asarray(var_levels)): + return np.asarray(geo_heights, dtype=float) + + geo_map = {str(k): v for k, v in zip(geo_levels, geo_heights)} + return np.asarray([geo_map.get(str(lev), np.nan) for lev in var_levels], dtype=float) + +_RHO_AIR = 1.225 # kg/m³, standard sea-level air density +_CP_AIR = 1005.0 # J/(kg·K), specific heat of dry air at constant pressure +_G = 9.80665 # m/s², gravitational acceleration +_R_DRY = 287.05 # J/(kg·K), specific gas constant for dry air + +def compute_thermal_updraft_w_star( + surface_heat_flux_wm2: float, + boundary_layer_height_m: float, + temperature_2m_K: float, +) -> float: + """ + Deardorff convective velocity scale w* (m/s). + + Standard measure of thermal updraft intensity used by Movebank ENV-DATA + and described in Bohrer et al. (2012, Ecology Letters). + + w* = (g/T * (H / (rho * cp)) * zi) ^ (1/3) + + Args: + surface_heat_flux_wm2: ERA5 surface sensible heat flux (W/m²). + Positive = surface heating the atmosphere = uplift. + boundary_layer_height_m: ERA5 planetary boundary layer height (m). + temperature_2m_K: ERA5 2-metre temperature (K), used as a proxy + for surface potential temperature. + + Returns: + w* in m/s. Returns 0.0 when heat flux <= 0 (no convection). + Returns NaN when any input is missing or physically invalid. 
def _finalize_and_save_annotation_output(out: pd.DataFrame, config: MultidimAnnotationConfig) -> pd.DataFrame:
    """Finalize derived metrics, save CSV output, and return the annotated DataFrame.

    Steps, each gated by the corresponding ``config`` flag and by the
    presence of the input columns it needs:

    1. Wind speed/direction and, optionally, wind support / crosswind
       relative to the track bearing.
    2. Vertical motion: omega (``td_w_at_height``) converted to a vertical
       velocity, using temperature and matched pressure level when both are
       available and a fixed density otherwise.
    3. Thermal updraft proxy (Deardorff w*).
    4. Orographic uplift from the DEM slope/aspect and the 10 m wind.

    The result is written to ``config.output_csv`` and, when
    ``config.save_per_individual`` is set, to one CSV per value of
    ``config.id_col`` in a sibling ``<stem>_by_individual`` directory.

    Returns:
        The annotated DataFrame that was written.
    """
    if config.derive_wind_speed_direction or config.derive_wind_support_crosswind:
        if "td_u_at_height" in out.columns and "td_v_at_height" in out.columns:
            if config.derive_wind_support_crosswind:
                # Wind support needs a bearing per point.
                out = add_track_bearing(
                    out,
                    config.id_col,
                    config.time_col,
                    config.lat_col,
                    config.lon_col,
                    config.heading_col,
                    config.heading_source,
                )
            out = add_wind_metrics(out)

    # --- Vertical motion: convert ERA5 omega (Pa/s) to geometric w (m/s) ---
    if config.derive_vertical_motion and "td_w_at_height" in out.columns:
        omega = pd.to_numeric(out["td_w_at_height"], errors="coerce")

        has_temp = "td_temperature_at_height" in out.columns
        # Diagnostics columns are named "<variable>_<diagnostic key>". Prefer
        # the level matched for the w component itself; fall back to the
        # first "*_matched_level" column (excluding the "..._height" ones).
        # NOTE(review): the level is taken to be in hPa, as stated by the
        # original comment about vertical_sample diagnostics - confirm.
        level_candidates = [
            c for c in out.columns if c.endswith("_matched_level") and "height" not in c
        ]
        if "w_matched_level" in level_candidates:
            level_col = "w_matched_level"
        else:
            level_col = level_candidates[0] if level_candidates else None
        has_level = level_col is not None

        if has_temp and has_level:
            T_K = pd.to_numeric(out["td_temperature_at_height"], errors="coerce")
            P_Pa = pd.to_numeric(out[level_col], errors="coerce") * 100.0  # hPa -> Pa
            rho = P_Pa / (_R_DRY * T_K)
            out["vertical_motion_ms"] = -omega / (rho * _G)
            out["vertical_motion_omega_Pa_s"] = omega
            out["vertical_motion_note"] = (
                "vertical_motion_ms: ERA5 omega (Pa/s) converted to geometric "
                "vertical velocity (m/s) via w = -omega / (rho * g), "
                "rho = P / (R_dry * T). Positive = upward."
            )
        else:
            # Fall back to a standard-atmosphere approximation (rho ~ 1.0 kg/m³)
            # valid roughly between 1 and 10 km altitude.
            out["vertical_motion_ms"] = -omega / (1.0 * _G)
            out["vertical_motion_omega_Pa_s"] = omega
            out["vertical_motion_note"] = (
                "WARNING: vertical_motion_ms estimated with rho=1.0 kg/m3 "
                "(standard atmosphere approximation). For accurate conversion "
                "provide temperature and pressure level data. Positive = upward."
            )

    # --- Thermal updraft: Deardorff w* ---
    if config.derive_thermal_proxy:
        has_shf = "surface_surface_sensible_heat_flux" in out.columns
        has_blh = "surface_boundary_layer_height" in out.columns
        has_t2m = "surface_2m_temperature" in out.columns

        if has_shf and has_blh and has_t2m:
            out["thermal_updraft_w_star_ms"] = [
                compute_thermal_updraft_w_star(
                    row["surface_surface_sensible_heat_flux"],
                    row["surface_boundary_layer_height"],
                    row["surface_2m_temperature"],
                )
                for _, row in out.iterrows()
            ]
            out["thermal_updraft_note"] = (
                "Deardorff convective velocity scale w* (m/s). "
                "Positive = convective uplift available. "
                "Method: Bohrer et al. 2012 / Movebank ENV-DATA."
            )
        elif "td_temperature_at_height" in out.columns:
            out["temperature_at_height_K"] = out["td_temperature_at_height"]
            out["thermal_updraft_note"] = (
                "WARNING: w* not computed. Requires surface variables: "
                "surface_sensible_heat_flux, boundary_layer_height, 2m_temperature. "
                "Storing raw temperature at flight height instead."
            )

    # --- Orographic uplift: Bohrer et al. 2012 ---
    if config.derive_orographic_uplift and config.dem_file:
        has_u10 = "surface_u_component_of_wind_10m" in out.columns
        has_v10 = "surface_v_component_of_wind_10m" in out.columns

        if has_u10 and has_v10:
            slopes_aspects = [
                compute_dem_slope_aspect(
                    config.dem_file,
                    float(row[config.lat_col]),
                    float(row[config.lon_col]),
                )
                for _, row in out.iterrows()
            ]
            out["orographic_uplift_ms"] = [
                compute_orographic_uplift(
                    float(row["surface_u_component_of_wind_10m"]),
                    float(row["surface_v_component_of_wind_10m"]),
                    sa[0],
                    sa[1],
                )
                for (_, row), sa in zip(out.iterrows(), slopes_aspects)
            ]
            out["orographic_uplift_note"] = (
                "Wo = V_surface * sin(slope) * cos(wind_from - aspect) (m/s). "
                "Method: Bohrer et al. 2012 / Movebank ENV-DATA. "
                "Positive = updraft on windward slope."
            )
        else:
            out["orographic_uplift_ms"] = np.nan
            out["orographic_uplift_note"] = (
                "WARNING: orographic uplift not computed. "
                "Add surface_u_component_of_wind_10m and "
                "surface_v_component_of_wind_10m as surface variables."
            )

    output_path = _as_path(config.output_csv)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(output_path, index=False, encoding="utf-8-sig", date_format="%Y-%m-%d %H:%M:%S")

    if config.save_per_individual and config.id_col in out.columns:
        per_dir = output_path.parent / f"{output_path.stem}_by_individual"
        per_dir.mkdir(parents=True, exist_ok=True)
        used_stems = set()
        for ident, group in out.groupby(config.id_col, dropna=False):
            safe = re.sub(r"[^\w\-]", "_", str(ident).strip()) or "unknown"
            # Different identifiers can sanitise to the same file name
            # (e.g. "A 1" and "A_1"). Add a numeric suffix so a later group
            # never overwrites an earlier one's file.
            stem, attempt = safe, 1
            while stem in used_stems:
                attempt += 1
                stem = f"{safe}_{attempt}"
            used_stems.add(stem)
            group.to_csv(
                per_dir / f"{stem}.csv",
                index=False,
                encoding="utf-8-sig",
                date_format="%Y-%m-%d %H:%M:%S",
            )

    return out
config.surface.categorical, + config.surface.variables, + ) + for var in surface_vars: + self._surface_arrays[var] = _fast_open_3d(ds_surface, var) + + self._component_arrays: List[Tuple[str, OptionalComponentSpec, tuple]] = [] + for label, ds, spec in ( + ("u", ds_u, config.u_component), + ("v", ds_v, config.v_component), + ("w", ds_w, config.w_component), + ("temperature", ds_t, config.temperature_component), + ): + if ds is not None and spec.variable: + self._component_arrays.append((label, spec, _fast_open_4d(ds, spec.variable))) + + def geo_profile(self, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> np.ndarray: + return _fast_sample_4d_profile( + self._geo_arr, self._geo_times, self._geo_lat, self._geo_lon, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + + def var_profile(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> Tuple[np.ndarray, np.ndarray]: + arr, times, levels, lat_vals, lon_vals = self._main_arrays[var] + values = _fast_sample_4d_profile( + arr, times, lat_vals, lon_vals, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + return levels, values + + def component_profile(self, label: str, t: pd.Timestamp, lat: float, lon: float) -> Tuple[np.ndarray, np.ndarray]: + for lbl, _spec, arr_info in self._component_arrays: + if lbl == label: + arr, times, levels, lat_vals, lon_vals = arr_info + values = _fast_sample_4d_profile( + arr, times, lat_vals, lon_vals, + t=t, lat=lat, lon=lon, variable_type="continuous", + ) + return levels, values + raise KeyError(label) + + def surface_value(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> float: + s_arr, s_times, s_lat, s_lon = self._surface_arrays[var] + return _fast_sample_3d_value( + s_arr, s_times, s_lat, s_lon, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + + @property + def geo_levels(self) -> np.ndarray: + return self._geo_levels + + @property + def component_specs(self) -> List[Tuple[str, OptionalComponentSpec]]: 
class _XarraySampler:
    """
    Point sampler backed by the xarray-based sampling helpers.

    Every request is answered by calling ``sample_geopotential_profile``,
    ``sample_level_profile`` or ``sample_surface_value`` on the datasets held
    by the instance. Geopotential profiles are memoised per
    (time, lat, lon, time method). Used as the fallback when the fast sampler
    cannot be built, and for IDW spatial mode.
    """

    def __init__(
        self,
        ds_geo: xr.Dataset,
        ds_main: xr.Dataset,
        ds_surface: Optional[xr.Dataset],
        ds_u: Optional[xr.Dataset],
        ds_v: Optional[xr.Dataset],
        ds_w: Optional[xr.Dataset],
        ds_t: Optional[xr.Dataset],
        config: MultidimAnnotationConfig,
    ) -> None:
        self._config = config
        self._ds_geo = ds_geo
        self._ds_main = ds_main
        self._ds_surface = ds_surface

        # Keep only components that have both a dataset and a variable name.
        candidates = (
            ("u", ds_u, config.u_component),
            ("v", ds_v, config.v_component),
            ("w", ds_w, config.w_component),
            ("temperature", ds_t, config.temperature_component),
        )
        self._ds_components: Dict[str, Tuple[xr.Dataset, OptionalComponentSpec]] = {
            label: (dataset, spec)
            for label, dataset, spec in candidates
            if dataset is not None and spec.variable
        }

        self._geo_cache: Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]] = {}

    def geo_profile(self, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> np.ndarray:
        """Return the geopotential height profile at the given point."""
        return self._get_geo_cached(t, lat, lon, vtype)[1]

    def _get_geo_cached(self, t, lat, lon, vtype) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(levels, heights)`` for the point, sampling on first use."""
        time_method = "nearest" if vtype == "categorical" else "linear"
        key = _geopotential_cache_key(t, lat, lon, time_method, None, None)
        cached = self._geo_cache.get(key)
        if cached is None:
            cfg = self._config
            levels, heights = sample_geopotential_profile(
                self._ds_geo,
                cfg.geopotential_variable,
                t=t, lat=lat, lon=lon,
                units_override=cfg.geopotential_units,
                convert_geopotential_to_height=cfg.convert_geopotential_to_height,
                gravity_constant=cfg.gravity_constant,
                time_method=time_method,
                spatial_method="nearest",
            )
            cached = (levels, heights)
            self._geo_cache[key] = cached
        return cached

    def var_profile(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(levels, values)`` of a main multilevel variable."""
        time_method = "nearest" if vtype == "categorical" else "linear"
        return sample_level_profile(
            self._ds_main, var,
            t=t, lat=lat, lon=lon,
            time_method=time_method,
            spatial_method="nearest",
        )

    def component_profile(self, label: str, t: pd.Timestamp, lat: float, lon: float) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(levels, values)`` of a registered component.

        Raises ``KeyError`` when ``label`` was not registered.
        """
        dataset, spec = self._ds_components[label]
        return sample_level_profile(
            dataset, spec.variable,
            t=t, lat=lat, lon=lon,
            time_method="linear",
            spatial_method="nearest",
        )

    def surface_value(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> float:
        """Sample a surface variable using the configured spatial method."""
        return sample_surface_value(
            self._ds_surface, var,
            t=t, lat=lat, lon=lon,
            variable_type=vtype,
            spatial_method=self._config.spatial_method,
        )

    @property
    def geo_levels(self) -> np.ndarray:
        """Level coordinate of the geopotential dataset."""
        return np.asarray(self._ds_geo["level"].values)

    @property
    def component_specs(self) -> List[Tuple[str, OptionalComponentSpec]]:
        """``(label, spec)`` for every registered component, in registration order."""
        return [(label, entry[1]) for label, entry in self._ds_components.items()]

    def sample_at_height(
        self,
        var: str,
        t: pd.Timestamp,
        lat: float,
        lon: float,
        vtype: VariableType,
        target_height: float,
        terrain: float,
        surface_value: Optional[float],
        config: MultidimAnnotationConfig,
    ) -> Tuple[float, Dict[str, Any]]:
        """Sample ``var`` at ``target_height`` via ``_sample_multilevel``.

        A minimal one-point record is built from ``t``, ``lat`` and ``lon``
        under the column names given by ``config``. The instance's
        geopotential cache is shared with the delegate.
        """
        point = pd.Series({
            config.time_col: t,
            config.lat_col: lat,
            config.lon_col: lon,
        })
        return _sample_multilevel(
            self._ds_main,
            var,
            self._ds_geo,
            config.geopotential_variable,
            point,
            config,
            target_height,
            terrain,
            vtype,
            surface_value,
            geo_cache=self._geo_cache,
        )
Dict[str, Any]: + """ + Annotate one movement point. Writes results into `out` in-place. + Returns diagnostics dict for this point. + """ + warnings_for_point: List[str] = [] + t = pd.Timestamp(row[config.time_col]) + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + + # --- Terrain --- + if config.dem_file: + terrain, dem_warning = sample_dem_elevation(config.dem_file, lat, lon) + if dem_warning and dem_warning != "dem_not_provided": + warnings_for_point.append(dem_warning) + else: + terrain, dem_warning = np.nan, "" + out.at[idx, "terrain_elevation_m"] = terrain + + # --- Height conversion --- + height_msl, hdiag = compute_orthometric_height( + row[config.height_col], lat, lon, + height_reference=config.height_reference, + geoid_mode=config.geoid_mode, + constant_geoid_undulation_m=config.constant_geoid_undulation_m, + geoid_grid_path=config.geoid_grid_path, + terrain_elevation_m=terrain, + ) + out.at[idx, "height_msl_m"] = height_msl + if np.isfinite(height_msl) and np.isfinite(terrain): + out.at[idx, "height_agl_m"] = height_msl - terrain + if hdiag.get("height_conversion_warning"): + warnings_for_point.append(str(hdiag["height_conversion_warning"])) + + row_diag: Dict[str, Any] = {**hdiag, "dem_warning": dem_warning} + + # --- Surface variables --- + surface_values: Dict[str, Any] = {} + for var in surface_vars: + if config.surface is None: + continue + vtype = _var_type(var, config.surface) + try: + sval = sampler.surface_value(var, t, lat, lon, vtype) + out.at[idx, f"surface_{var}"] = sval + surface_values[var] = sval + except Exception as exc: + warnings_for_point.append(f"surface_{var}_failed:{exc}") + + # Geopotential profile + geo_profile_cache: Dict[VariableType, np.ndarray] = {} + + def _get_geo_heights(vtype: VariableType) -> np.ndarray: + if vtype not in geo_profile_cache: + geo_profile_cache[vtype] = sampler.geo_profile(t, lat, lon, vtype) + return geo_profile_cache[vtype] + + # --- Main multilevel variables --- + + for var in 
main_vars: + vtype = _var_type(var, config.multilevel) + try: + anchor = surface_values.get(surface_anchor_var) if ( + surface_anchor_var and vtype == "continuous" and config.use_surface_as_lower_anchor + ) else None + + if hasattr(sampler, "sample_at_height") and config.spatial_method != "nearest": + # IDW: horizontal and vertical sampling together + val, diag = sampler.sample_at_height( + var, t, lat, lon, vtype, height_msl, terrain, anchor, config, + ) + else: + # Nearest: first profile, then vertical interpolation + var_levels, values = sampler.var_profile(var, t, lat, lon, vtype) + geo_heights = _get_geo_heights(vtype) + heights_for_values = _fast_heights_for_variable_levels( + sampler.geo_levels, geo_heights, var_levels, + ) + sh = _surface_height(config, terrain) if anchor is not None else None + val, diag = vertical_sample( + var_levels, heights_for_values, values, height_msl, + method=config.vertical_method, + variable_type=vtype, + surface_value=anchor, + surface_height_m=sh, + allow_extrapolation=config.allow_vertical_extrapolation, + ) + + out.at[idx, f"td_{var}_at_height"] = val + for k, v in diag.items(): + row_diag[f"{var}_{k}"] = v + if diag.get("vertical_warning"): + warnings_for_point.append(f"{var}:{diag['vertical_warning']}") + + except Exception as exc: + warnings_for_point.append(f"{var}_sampling_failed:{exc}") + + # --- Wind/temperature components --- + for label, _spec in sampler.component_specs: + try: + comp_levels, values = sampler.component_profile(label, t, lat, lon) + geo_heights = _get_geo_heights("continuous") + heights_for_values = _fast_heights_for_variable_levels( + sampler.geo_levels, geo_heights, comp_levels, + ) + val, diag = vertical_sample( + comp_levels, heights_for_values, values, height_msl, + method=config.vertical_method, + variable_type="continuous", + allow_extrapolation=config.allow_vertical_extrapolation, + ) + out.at[idx, f"td_{label}_at_height"] = val + if config.keep_diagnostics: + for k, v in diag.items(): + 
row_diag[f"{label}_{k}"] = v + if diag.get("vertical_warning"): + warnings_for_point.append(f"{label}:{diag['vertical_warning']}") + + except Exception as exc: + warnings_for_point.append(f"{label}_sampling_failed:{exc}") + + out.at[idx, "annotation_warning"] = ";".join(w for w in warnings_for_point if w) + return row_diag + +def run_multidimensional_annotation(config: MultidimAnnotationConfig) -> pd.DataFrame: + config.vertical_method = _normalize_vertical_method(config.vertical_method) # type: ignore[assignment] + config.spatial_method = _normalize_spatial_method(config.spatial_method) # type: ignore[assignment] + if config.spatial_method == "nearest": + config.smoothing_k = 1 + + ds_geo = open_dataset(config.geopotential_file, config.coord_spec) + ds_main = open_dataset(config.multilevel.path, config.coord_spec) + ds_surface = open_dataset(config.surface.path, config.coord_spec) if config.surface else None + ds_u = open_dataset(config.u_component.path, config.coord_spec) if config.u_component.is_enabled() else None + ds_v = open_dataset(config.v_component.path, config.coord_spec) if config.v_component.is_enabled() else None + ds_w = open_dataset(config.w_component.path, config.coord_spec) if config.w_component.is_enabled() else None + ds_t = open_dataset(config.temperature_component.path, config.coord_spec) if config.temperature_component.is_enabled() else None + + datasets_to_close = [ds_geo, ds_main, ds_surface, ds_u, ds_v, ds_w, ds_t] + try: + movement = _load_movement(config, ds_main) + movement = _prefilter_time(movement, config, [ds_geo, ds_main]) + + ds_geo = subset_dataset_to_movement(ds_geo, movement, config) + ds_main = subset_dataset_to_movement(ds_main, movement, config) + ds_surface = subset_dataset_to_movement(ds_surface, movement, config) if ds_surface is not None else None + ds_u = subset_dataset_to_movement(ds_u, movement, config) if ds_u is not None else None + ds_v = subset_dataset_to_movement(ds_v, movement, config) if ds_v is not None else 
None + ds_w = subset_dataset_to_movement(ds_w, movement, config) if ds_w is not None else None + ds_t = subset_dataset_to_movement(ds_t, movement, config) if ds_t is not None else None + main_vars = _unique( + config.multilevel.continuous, + config.multilevel.categorical, + config.multilevel.variables, + ) + surface_vars = _unique( + config.surface.continuous, + config.surface.categorical, + config.surface.variables, + ) if config.surface else [] + surface_anchor_var = ( + config.surface.continuous[0] + if (config.surface and config.surface.continuous) + else None + ) + + if config.spatial_method == "nearest": + try: + sampler = _FastSampler( + ds_geo, ds_main, ds_surface, + ds_u, ds_v, ds_w, ds_t, config, + ) + except Exception as exc: + LOGGER.warning( + "FastSampler init failed (%s), falling back to XarraySampler.", + exc, + exc_info=True, + ) + sampler = _XarraySampler( + ds_geo, ds_main, ds_surface, + ds_u, ds_v, ds_w, ds_t, config, + ) + else: + sampler = _XarraySampler( + ds_geo, ds_main, ds_surface, + ds_u, ds_v, ds_w, ds_t, config, + ) + + out = movement.copy() + for col in ("terrain_elevation_m", "height_msl_m", "height_agl_m"): + out[col] = np.nan + out["annotation_warning"] = "" + for var in main_vars: + out[f"td_{var}_at_height"] = np.nan + for var in surface_vars: + out[f"surface_{var}"] = np.nan + for label, _spec in sampler.component_specs: + out[f"td_{label}_at_height"] = np.nan + + diag_rows: List[Dict[str, Any]] = [] + for idx, row in out.iterrows(): + row_diag = _process_single_point( + idx, row, sampler, config, out, + main_vars, surface_vars, surface_anchor_var, + ) + diag_rows.append(row_diag) + + if config.keep_diagnostics and diag_rows: + diag_df = pd.DataFrame(diag_rows, index=out.index) + for col in diag_df.columns: + if col not in out.columns: + out[col] = diag_df[col] + + return _finalize_and_save_annotation_output(out, config) + finally: + for ds in datasets_to_close: + try: + if ds is not None: + ds.close() + except Exception: + pass 
def _list_or_empty(values: Optional[Sequence[str]]) -> List[str]:
    """Return ``values`` as a fresh list; ``None`` and empty inputs both give ``[]``."""
    if not values:
        return []
    return list(values)


def run_multidimensional_annotation_from_paths(
    *,
    movement_csv: Union[str, Path],
    output_csv: Union[str, Path],
    id_col: str,
    time_col: str,
    lat_col: str,
    lon_col: str,
    height_col: str,
    geopotential_file: Union[str, Path],
    geopotential_variable: str,
    multilevel_var_file: Union[str, Path],
    multilevel_variable: Optional[str] = None,
    multilevel_continuous_vars: Optional[Sequence[str]] = None,
    multilevel_categorical_vars: Optional[Sequence[str]] = None,
    surface_var_file: Optional[Union[str, Path]] = None,
    surface_variable: Optional[str] = None,
    surface_continuous_vars: Optional[Sequence[str]] = None,
    surface_categorical_vars: Optional[Sequence[str]] = None,
    selected_ids: Optional[Sequence[str]] = None,
    boundary_path: Optional[Union[str, Path]] = None,
    bbox: Optional[Dict[str, float]] = None,
    coord_spec: Optional[Dict[str, Optional[str]]] = None,
    nc_time_var: Optional[str] = None,
    nc_lat_var: Optional[str] = None,
    nc_lon_var: Optional[str] = None,
    nc_level_var: Optional[str] = None,
    spatial_interpolation_method: str = "Nearest neighbor",
    smoothing_k: int = 1,
    vertical_matching_method: str = "Nearest geopotential-height level",
    geopotential_units: Optional[str] = "m2 s-2",
    convert_geopotential_to_height: bool = True,
    use_surface_as_lower_anchor: bool = True,
    surface_anchor_height_agl_m: float = 2.0,
    dem_file: Optional[Union[str, Path]] = None,
    save_per_individual: bool = False,
    keep_diagnostics: bool = True,
    height_reference: HeightReference = "ellipsoidal",
    geoid_mode: GeoidMode = "geographiclib",
    constant_geoid_undulation_m: float = 0.0,
    geoid_grid_path: Optional[Union[str, Path]] = None,
    u_file: Optional[Union[str, Path]] = None,
    u_variable: Optional[str] = None,
    v_file: Optional[Union[str, Path]] = None,
    v_variable: Optional[str] = None,
    w_file: Optional[Union[str, Path]] = None,
    w_variable: Optional[str] = None,
    temperature_file: Optional[Union[str, Path]] = None,
    temperature_variable: Optional[str] = None,
    derive_wind_speed_direction: bool = False,
    derive_wind_support_crosswind: bool = False,
    derive_vertical_motion: bool = False,
    derive_thermal_proxy: bool = False,
    derive_orographic_uplift: bool = False,
    heading_col: Optional[str] = None,
    heading_source: Literal["compute", "column"] = "compute",
) -> pd.DataFrame:
    """
    Keyword-only entry point: build a ``MultidimAnnotationConfig`` from plain
    paths / names and hand it to ``run_multidimensional_annotation``.

    Variable selection rules implemented here:
    - ``multilevel_variable`` / ``surface_variable`` (single name) are used
      only when neither the continuous nor the categorical list is given;
      the single name is then treated as a continuous variable.
    - A surface ``DatasetSpec`` is created only when both a surface file and
      at least one surface variable are supplied; otherwise ``surface=None``.
    - ``coord_spec`` wins over the individual ``nc_*_var`` arguments; entries
      with empty / ``None`` names are dropped either way.

    Raises
    ------
    ValueError
        If no multilevel variable ends up selected.
    """
    # Coordinate-name overrides: explicit mapping first, else the nc_* names.
    if coord_spec is None:
        requested_coords: Dict[str, Optional[str]] = {
            "time": nc_time_var,
            "lat": nc_lat_var,
            "lon": nc_lon_var,
            "level": nc_level_var,
        }
    else:
        requested_coords = coord_spec
    coord_spec = {axis: name for axis, name in requested_coords.items() if name}

    # Multilevel variables (at least one is mandatory).
    multilevel_continuous = _list_or_empty(multilevel_continuous_vars)
    multilevel_categorical = _list_or_empty(multilevel_categorical_vars)
    if multilevel_variable and not (multilevel_continuous or multilevel_categorical):
        multilevel_continuous = [multilevel_variable]
    multilevel_all = _unique(multilevel_continuous, multilevel_categorical)
    if not multilevel_all:
        raise ValueError("No multilevel variables selected.")

    # Surface variables (entirely optional).
    surface_continuous = _list_or_empty(surface_continuous_vars)
    surface_categorical = _list_or_empty(surface_categorical_vars)
    if surface_variable and not (surface_continuous or surface_categorical):
        surface_continuous = [surface_variable]
    surface_all = _unique(surface_continuous, surface_categorical)

    surface = (
        DatasetSpec(
            surface_var_file,
            variables=surface_all,
            continuous=surface_continuous,
            categorical=surface_categorical,
            label_prefix="surface",
        )
        if surface_var_file and surface_all
        else None
    )

    config = MultidimAnnotationConfig(
        movement_csv=movement_csv,
        output_csv=output_csv,
        id_col=id_col,
        time_col=time_col,
        lat_col=lat_col,
        lon_col=lon_col,
        height_col=height_col,
        selected_ids=None if selected_ids is None else list(selected_ids),
        boundary_path=boundary_path,
        bbox=bbox,
        coord_spec=coord_spec,
        geopotential_file=geopotential_file,
        geopotential_variable=geopotential_variable,
        geopotential_units=geopotential_units,
        convert_geopotential_to_height=convert_geopotential_to_height,
        multilevel=DatasetSpec(
            multilevel_var_file,
            variables=multilevel_all,
            continuous=multilevel_continuous,
            categorical=multilevel_categorical,
            label_prefix="td",
        ),
        surface=surface,
        spatial_method=_normalize_spatial_method(spatial_interpolation_method),
        smoothing_k=int(smoothing_k or 1),
        vertical_method=_normalize_vertical_method(vertical_matching_method),
        use_surface_as_lower_anchor=use_surface_as_lower_anchor,
        surface_height_agl_m=float(surface_anchor_height_agl_m),
        dem_file=dem_file,
        save_per_individual=save_per_individual,
        keep_diagnostics=keep_diagnostics,
        height_reference=height_reference,
        geoid_mode=geoid_mode,
        constant_geoid_undulation_m=float(constant_geoid_undulation_m or 0.0),
        geoid_grid_path=geoid_grid_path,
        u_component=OptionalComponentSpec(u_file, u_variable, "u"),
        v_component=OptionalComponentSpec(v_file, v_variable, "v"),
        w_component=OptionalComponentSpec(w_file, w_variable, "w"),
        temperature_component=OptionalComponentSpec(
            temperature_file, temperature_variable, "temperature"
        ),
        derive_wind_speed_direction=derive_wind_speed_direction,
        derive_wind_support_crosswind=derive_wind_support_crosswind,
        derive_vertical_motion=derive_vertical_motion,
        derive_thermal_proxy=derive_thermal_proxy,
        derive_orographic_uplift=derive_orographic_uplift,
        heading_col=heading_col,
        heading_source=heading_source,
    )
    return run_multidimensional_annotation(config)


def run_three_dim_annotation(*args: Any, **kwargs: Any) -> pd.DataFrame:
    """Alias that forwards every argument unchanged to ``run_multidimensional_annotation``."""
    return run_multidimensional_annotation(*args, **kwargs)


def sample_era5_at_height(*args: Any, **kwargs: Any) -> pd.DataFrame:
    """Placeholder entry point: always raises ``NotImplementedError``."""
    raise NotImplementedError("Use run_multidimensional_annotation_from_paths() instead.")


__all__ = [
    "G0",
    "DatasetSpec",
    "OptionalComponentSpec",
    "MultidimAnnotationConfig",
    "parse_movebank_timestamp_series",
    "open_dataset",
    "sample_surface_value",
    "sample_level_profile",
    "sample_geopotential_profile",
    "vertical_sample",
    "sample_dem_elevation",
    "compute_orthometric_height",
    "add_track_bearing",
    "add_wind_metrics",
    "run_multidimensional_annotation",
    "run_multidimensional_annotation_from_paths",
    "run_three_dim_annotation",
]
# New file in this patch: ecodata/nc_builder_functions.py
"""
Backend functions for NCBuilder_App.

This module is intentionally UI-free:
- no Panel imports
- no ECODATA template imports
- no register_view imports

It can be imported safely from ecodata.__init__ or from nc_builder_app.py.
"""

from __future__ import annotations

from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
import json
import re

import numpy as np
import pandas as pd
import xarray as xr


# File suffixes (compared case-insensitively) that list_netcdf_files() accepts.
NETCDF_EXTENSIONS = (".nc", ".nc4", ".cdf", ".netcdf")


@dataclass
class NCBuildConfig:
    """
    All user choices needed to standardize and combine NetCDF files.

    The first six fields are required and positional; everything else has a
    default. String "mode" fields are compared against literal labels in this
    module, so the exact spelling matters (the labels are listed next to each
    field below).
    """

    # Input file paths. Paths that do not exist are skipped by validation
    # and by the build loop rather than raising.
    files: List[str]
    # Combination strategy; labels referenced in this module:
    # "By time", "By level", "By time and level".
    combine_mode: str
    # Legacy single-variable selection; used only when `target_variables`
    # yields no usable name.
    target_variable: str
    # Name given to the output variable in single-variable mode. In
    # multi-variable mode the source names are kept.
    output_variable_name: str
    # Source names of the latitude / longitude variables; they are renamed
    # to "lat" / "lon".
    lat_variable: str
    lon_variable: str
    # Optional multi-variable mode
    target_variables: Optional[List[str]] = None
    # Where time comes from: "From NetCDF time coordinate", "From filename",
    # or "Manual table".
    time_source: str = "From NetCDF time coordinate"
    # Source time variable, renamed to "time" when time_source is the NetCDF
    # coordinate.
    time_variable: Optional[str] = None
    # Regex (first group, or whole match) and strptime-style format used to
    # read the timestamp from the file name.
    time_regex: str = r"(\d{8})"
    time_format: str = "%Y%m%d"
    # CSV with columns `name` and `DateTime` (used for "Manual table").
    time_table_path: Optional[str] = None
    # Where the level comes from: "From NetCDF coordinate", "From filename",
    # or "Manual table".
    level_source: str = "From NetCDF coordinate"
    # Source level variable, renamed to "level" when taken from the file.
    level_variable: Optional[str] = None
    # Regex whose match is cast to float to obtain the level from the file name.
    level_regex: str = r"level(\d+)"
    # CSV with columns `name` and `level` (used for "Manual table").
    level_table_path: Optional[str] = None
    # NOTE(review): not referenced by the code visible in this part of the
    # module (the level coordinate is always named "level") -- confirm usage.
    output_level_coord_name: str = "level"
    # Written to the level coordinate's `units` attribute; "hPa"/"Pa" and "m"
    # additionally select the CF standard_name / positive direction.
    level_units: str = "hPa"

    # Optional spatial subset with keys "south", "north", "west", "east".
    bbox: Optional[Dict[str, float]] = None
    # Optional time-subset bounds (parsed with pandas.to_datetime). The subset
    # is not applied when combine_mode is "By time".
    start_time: Optional[str] = None
    end_time: Optional[str] = None

    output_path: str = "standardized_output.nc"
    # When True, files are opened with `chunks`: "auto" if chunking_mode is
    # "auto", otherwise the positive entries of `manual_chunks`.
    use_dask_chunks: bool = True
    chunking_mode: str = "auto"
    manual_chunks: Optional[Dict[str, int]] = None
    # When True, every data variable is written with zlib compression.
    enable_compression: bool = True
    # When True, 1-D longitudes above 180 are wrapped into -180..180.
    convert_longitude_to_180: bool = True
    # "auto" means: xarray default -> h5netcdf -> netcdf4 -> scipy.
    # Accepted values: "auto", "default", "h5netcdf", "netcdf4", "scipy".
    open_engine: str = "auto"
    # ECODATA/MATLAB compatibility: when True the time coordinate is encoded
    # as "days since 2000-01-01" with calendar "julian" on write.
    use_modis_time_encoding: bool = True
scipy. + open_engine: str = "auto" + #! for ECODATA/MATLAB compatibility + use_modis_time_encoding: bool = True + + +def list_netcdf_files(folder: str | Path) -> List[Path]: + folder = Path(folder).expanduser() + if not folder.exists() or not folder.is_dir(): + return [] + + allowed = {ext.lower() for ext in NETCDF_EXTENSIONS} + + files = [ + p for p in folder.iterdir() + if p.is_file() and p.suffix.lower() in allowed + ] + + return sorted(files, key=lambda p: p.name.lower()) + + +def _guess_name(candidates: Sequence[str], preferred: Sequence[str]) -> Optional[str]: + if not candidates: + return None + lower_map = {str(c).lower(): str(c) for c in candidates} + for p in preferred: + if p.lower() in lower_map: + return lower_map[p.lower()] + for c in candidates: + cl = str(c).lower() + if any(p.lower() in cl for p in preferred): + return str(c) + return None + + +def _safe_open_for_scan( + path: str | Path, + preferred_engine: Optional[str] = None, +) -> Tuple[xr.Dataset, str]: + """ + Open dataset for metadata scanning. + + For scanning, avoid chunks="auto" because it can fail when dask is not installed + and is unnecessary for reading names, dimensions, and coordinate ranges. 
def scan_netcdf_files(
    files: Sequence[str | Path],
    max_scan: int = 10,
    use_dask_chunks: bool = False,
    chunking_mode: str = "auto",
    manual_chunks: Optional[Dict[str, int]] = None,
) -> Dict[str, Any]:
    """
    Collect variable / coordinate / dimension names and a preview time range
    from up to ``max_scan`` of the given files.

    Only paths that exist are considered. Files that cannot be opened or read
    do not abort the scan; the problem is recorded under ``"warnings"``.
    ``use_dask_chunks``, ``chunking_mode`` and ``manual_chunks`` are accepted
    but not used by this function.

    Returns a dict with the keys ``files``, ``variables``, ``coords``,
    ``dims``, ``all_names``, ``suggested_time`` / ``_lat`` / ``_lon`` /
    ``_level``, ``time_min``, ``time_max`` (strings or ``None``),
    ``scanned_count``, ``warnings`` and ``engine_by_file``.
    """
    time_hints = [
        "time", "valid_time", "forecast_time", "verification_time",
        "datetime", "date", "t", "Time",
    ]

    found: List[Path] = []
    for item in files:
        candidate = Path(item).expanduser()
        if candidate.exists():
            found.append(candidate)

    if not found:
        return {
            "files": [],
            "variables": [],
            "coords": [],
            "dims": [],
            "all_names": [],
            "suggested_time": None,
            "suggested_lat": None,
            "suggested_lon": None,
            "suggested_level": None,
            "time_min": None,
            "time_max": None,
            "scanned_count": 0,
            "engine_by_file": {},
            "warnings": ["No existing NetCDF files were found."],
        }

    notes: List[str] = []
    engines: Dict[str, str] = {}
    seen_vars: set = set()
    seen_coords: set = set()
    seen_dims: set = set()
    earliest = None
    latest = None
    opened_ok = 0

    for nc_path in found[:max_scan]:
        try:
            dataset, engine_name = _safe_open_for_scan(nc_path)
            engines[nc_path.name] = engine_name
            with dataset:
                opened_ok += 1
                seen_vars.update(str(name) for name in dataset.data_vars)
                seen_coords.update(str(name) for name in dataset.coords)
                seen_dims.update(str(name) for name in dataset.dims)

                time_name = _guess_name(
                    [str(name) for name in dataset.variables],
                    time_hints,
                )
                if not time_name or time_name not in dataset.variables:
                    continue

                lo, hi, calendar_name = _safe_time_range(dataset[time_name].values)
                if lo is not None and hi is not None:
                    # Compared as strings so that pandas and cftime values
                    # from different files can be ranked against each other.
                    if earliest is None or str(lo) < str(earliest):
                        earliest = lo
                    if latest is None or str(hi) > str(latest):
                        latest = hi

                if calendar_name:
                    notes.append(
                        f"Time in {nc_path.name} uses non-pandas calendar/time type "
                        f"`{calendar_name}`; preview time range is shown as string."
                    )
        except Exception as exc:
            notes.append(f"Could not scan {nc_path.name}: {exc}")

    every_name = sorted(seen_vars | seen_coords | seen_dims)

    return {
        "files": [str(path) for path in found],
        "variables": sorted(seen_vars),
        "coords": sorted(seen_coords),
        "dims": sorted(seen_dims),
        "all_names": every_name,
        "suggested_time": _guess_name(every_name, time_hints),
        "suggested_lat": _guess_name(every_name, ["lat", "latitude", "Latitude", "y"]),
        "suggested_lon": _guess_name(every_name, ["lon", "longitude", "Longitude", "long", "x"]),
        "suggested_level": _guess_name(
            every_name,
            ["level", "pressure_level", "isobaricInhPa", "isobaric_in_hPa", "plev", "lev", "height", "altitude"],
        ),
        "time_min": None if earliest is None else str(earliest),
        "time_max": None if latest is None else str(latest),
        "scanned_count": opened_ok,
        "warnings": notes,
        "engine_by_file": engines,
    }
+ ) + except Exception as exc: + warnings.append(f"Could not scan {path.name}: {exc}") + + all_names = sorted(variables | coords | dims) + + suggested_time = _guess_name( + all_names, + ["time", "valid_time", "forecast_time", "verification_time", "datetime", "date", "t", "Time"], + ) + suggested_lat = _guess_name(all_names, ["lat", "latitude", "Latitude", "y"]) + suggested_lon = _guess_name(all_names, ["lon", "longitude", "Longitude", "long", "x"]) + suggested_level = _guess_name( + all_names, + ["level", "pressure_level", "isobaricInhPa", "isobaric_in_hPa", "plev", "lev", "height", "altitude"], + ) + + return { + "files": [str(f) for f in existing], + "variables": sorted(variables), + "coords": sorted(coords), + "dims": sorted(dims), + "all_names": all_names, + "suggested_time": suggested_time, + "suggested_lat": suggested_lat, + "suggested_lon": suggested_lon, + "suggested_level": suggested_level, + "time_min": str(time_min) if time_min is not None else None, + "time_max": str(time_max) if time_max is not None else None, + "scanned_count": scanned_count, + "warnings": warnings, + "engine_by_file": engine_by_file + } + +def _open_dataset_auto( + path: str | Path, + open_kwargs: Optional[Dict[str, Any]] = None, + preferred_engine: Optional[str] = None, +) -> Tuple[xr.Dataset, str]: + """ + Open a NetCDF file with automatic engine fallback. + + Engine strategy: + - preferred_engine, if explicitly provided and not "auto"/"default"; + - xarray default engine; + - h5netcdf; + - netcdf4; + - scipy. + + Returns + ------- + ds : xr.Dataset + Opened dataset. + engine_used : str + Engine name used for opening. "default" means xarray default engine. 
+ """ + path = Path(path).expanduser() + open_kwargs = dict(open_kwargs or {}) + + preferred_engine = preferred_engine or "auto" + + engines: List[Optional[str]] = [] + + if preferred_engine not in ("auto", "default", None): + engines.append(str(preferred_engine)) + + engines.extend([None, "h5netcdf", "netcdf4", "scipy"]) + + tried: List[str] = [] + last_exc: Optional[Exception] = None + + for engine in engines: + engine_label = engine or "default" + if engine_label in tried: + continue + tried.append(engine_label) + + kwargs = dict(open_kwargs) + if engine is not None: + kwargs["engine"] = engine + else: + kwargs.pop("engine", None) + + try: + ds = xr.open_dataset(path, **kwargs) + return ds, engine_label + except Exception as exc: + last_exc = exc + + raise OSError( + f"Could not open NetCDF file {path.name!r}. " + f"Tried engines: {tried}. Last error: {last_exc}" + ) + +def validate_build_config(config: NCBuildConfig) -> Tuple[bool, List[str], List[str]]: + errors: List[str] = [] + warnings: List[str] = [] + + files = [Path(f).expanduser() for f in config.files] + existing = [f for f in files if f.exists()] + if not existing: + errors.append("No existing NetCDF files were selected.") + + selected_vars = _selected_target_variables(config) + if not selected_vars: + errors.append("Target variable is not selected.") + + # output_variable_name is required only in single-variable mode. + # In multi-variable mode original source variable names are preserved. 
+ if len(selected_vars) == 1 and not config.output_variable_name: + errors.append("Output variable name is empty.") + + if not config.lat_variable: + errors.append("Latitude variable is not selected.") + + if not config.lon_variable: + errors.append("Longitude variable is not selected.") + + if config.time_source == "From NetCDF time coordinate" and not config.time_variable: + errors.append("Time source is NetCDF coordinate, but no time variable is selected.") + + if config.time_source == "From filename": + if not config.time_regex: + errors.append("Time source is filename, but time regex is empty.") + if not config.time_format: + errors.append("Time source is filename, but time format is empty.") + + if config.time_source == "Manual table" and not config.time_table_path: + errors.append("Time source is manual table, but no time table file is selected.") + + if config.combine_mode in ("By level", "By time and level"): + if config.level_source == "From NetCDF coordinate" and not config.level_variable: + errors.append("Combine mode requires level handling, but no level variable is selected.") + if config.level_source == "From filename" and not config.level_regex: + errors.append("Level source is filename, but level regex is empty.") + if config.level_source == "Manual table" and not config.level_table_path: + errors.append("Level source is manual table, but no level table file is selected.") + + if config.bbox is not None: + try: + south = float(config.bbox["south"]) + north = float(config.bbox["north"]) + west = float(config.bbox["west"]) + east = float(config.bbox["east"]) + if south >= north: + errors.append("Bounding box is invalid: South must be smaller than North.") + if west >= east: + errors.append("Bounding box is invalid: West must be smaller than East.") + except Exception: + errors.append("Bounding box is enabled but contains invalid values.") + + output_path = Path(config.output_path).expanduser() + if not output_path.name: + errors.append("Output 
filename is empty.") + if output_path.suffix.lower() not in (".nc", ".nc4"): + warnings.append("Output file does not end with .nc or .nc4.") + + valid_engines = {"auto", "default", "h5netcdf", "netcdf4", "scipy"} + if config.open_engine not in valid_engines: + errors.append( + f"Invalid open_engine={config.open_engine!r}. " + f"Expected one of: {sorted(valid_engines)}." + ) + + if config.open_engine == "auto": + warnings.append( + "NetCDF open engine will be selected automatically: default -> h5netcdf -> netcdf4 -> scipy." + ) + else: + warnings.append(f"NetCDF open engine preference: {config.open_engine}.") + + if config.convert_longitude_to_180: + warnings.append("Longitudes will be converted to the -180..180 convention when possible.") + if config.use_modis_time_encoding: + warnings.append("Time coordinate will use MATLAB/MODIS-compatible encoding when possible.") + return len(errors) == 0, errors, warnings + + +def _load_lookup_table(path: Optional[str], value_col: str) -> Dict[str, Any]: + if not path: + return {} + table_path = Path(path).expanduser() + if not table_path.exists(): + raise FileNotFoundError(f"Manual table not found: {table_path}") + + df = pd.read_csv(table_path) + if "name" not in df.columns or value_col not in df.columns: + raise ValueError(f"Manual table must contain columns: name, {value_col}") + + return {str(row["name"]): row[value_col] for _, row in df.iterrows()} + + +def _lookup_by_filename(path: Path, lookup: Dict[str, Any]) -> Optional[Any]: + if not lookup: + return None + name = path.name + for key, value in lookup.items(): + if str(key) == name or str(key) in name: + return value + return None + + +def _parse_from_filename(path: Path, regex: str, cast=float, time_format: Optional[str] = None) -> Any: + m = re.search(regex, path.name) + if not m: + raise ValueError(f"Pattern {regex!r} did not match file name {path.name!r}") + token = m.group(1) if m.groups() else m.group(0) + if time_format: + return pd.to_datetime(token, 
format=time_format) + return cast(token) + + +def _rename_if_needed(ds: xr.Dataset, old: Optional[str], new: str) -> xr.Dataset: + if not old or old == "None": + return ds + if old == new: + return ds + if old in ds.variables or old in ds.dims or old in ds.coords: + return ds.rename({old: new}) + return ds + +def _normalize_time_coord_if_possible(ds: xr.Dataset) -> xr.Dataset: + """ + Convert the time coordinate to pandas datetime only when this is safe. + + Standard calendars are usually convertible to pandas datetime64. + Non-standard calendars, such as julian, noleap, or 360_day, may be decoded + by xarray as cftime objects. pandas.to_datetime() cannot convert them + reliably, so they are preserved unchanged. + """ + if "time" not in ds.coords: + return ds + + values = ds["time"].values + + try: + converted = pd.to_datetime(values) + except Exception: + return ds + + return ds.assign_coords(time=converted) + +def _safe_time_range(values): + """ + Return time min/max for preview without failing on cftime calendars. + + pandas can handle standard datetime-like values, but not all cftime calendars + such as Julian, noleap, or 360_day. For cftime objects, use native min/max + and convert to string for display. + """ + if values is None or len(values) == 0: + return None, None, None + + try: + vals = pd.to_datetime(values) + if len(vals) == 0: + return None, None, None + return pd.Timestamp(vals.min()), pd.Timestamp(vals.max()), None + except Exception: + try: + cur_min = min(values) + cur_max = max(values) + calendar_type = type(values[0]).__name__ + return cur_min, cur_max, calendar_type + except Exception as exc: + return None, None, f"unreadable time values: {exc}" + +def _subset_bbox_1d_coords( + ds: xr.Dataset, + bbox: Dict[str, float], + source_name: str = "dataset", +) -> xr.Dataset: + """ + Subset a dataset by bbox using 1D lat/lon coordinate values. + + This does not require lat/lon to be xarray index coordinates. 
+ It works when lat and lon are 1D coordinates, even if their dimension names + are not exactly 'lat' and 'lon'. + """ + if "lat" not in ds.coords or "lon" not in ds.coords: + return ds + + lat = ds["lat"] + lon = ds["lon"] + + if lat.ndim != 1 or lon.ndim != 1: + raise ValueError( + f"Bounding box subset currently supports only 1D lat/lon coordinates in {source_name}. " + f"Got lat.ndim={lat.ndim}, lon.ndim={lon.ndim}." + ) + + south = float(bbox["south"]) + north = float(bbox["north"]) + west = float(bbox["west"]) + east = float(bbox["east"]) + + lat_dim = lat.dims[0] + lon_dim = lon.dims[0] + + lat_values = np.asarray(lat.values) + lon_values = np.asarray(lon.values) + + lat_mask = (lat_values >= south) & (lat_values <= north) + lon_mask = (lon_values >= west) & (lon_values <= east) + + lat_idx = np.where(lat_mask)[0] + lon_idx = np.where(lon_mask)[0] + + if lat_idx.size == 0: + raise ValueError( + f"Bounding box produced an empty latitude subset in {source_name}. " + f"Requested south/north=({south}, {north}); " + f"available lat range=({float(np.nanmin(lat_values))}, {float(np.nanmax(lat_values))})." + ) + + if lon_idx.size == 0: + raise ValueError( + f"Bounding box produced an empty longitude subset in {source_name}. " + f"Requested west/east=({west}, {east}); " + f"available lon range=({float(np.nanmin(lon_values))}, {float(np.nanmax(lon_values))})." + ) + + return ds.isel({ + lat_dim: lat_idx, + lon_dim: lon_idx, + }) + +def _json_safe(value: Any) -> Any: + """ + Convert common numpy/pandas/xarray/cftime objects to JSON-safe values. + + This is mainly used for writing the manifest file. NetCDF encodings may + contain numpy dtypes or other objects that json.dump cannot serialize. 
+ """ + if value is None: + return None + + if isinstance(value, (str, int, float, bool)): + return value + + if isinstance(value, Path): + return str(value) + + if isinstance(value, dict): + return {str(k): _json_safe(v) for k, v in value.items()} + + if isinstance(value, (list, tuple, set)): + return [_json_safe(v) for v in value] + + if isinstance(value, np.generic): + return value.item() + + if isinstance(value, np.dtype): + return str(value) + + if isinstance(value, pd.Timestamp): + return value.isoformat() + + # Handles pandas/numpy extension dtypes such as Float64DType. + if hasattr(value, "name") and value.__class__.__name__.endswith("DType"): + return str(value) + + # Handles cftime objects and any remaining non-JSON-native objects. + return str(value) + +def _selected_target_variables(config: NCBuildConfig) -> List[str]: + """ + Return target variables selected for output. + + New multi-variable mode uses config.target_variables. + Legacy single-variable mode uses config.target_variable. + """ + selected = [ + str(v) for v in (config.target_variables or []) + if v not in (None, "", "None") + ] + + if selected: + # Preserve order and remove duplicates. + unique: List[str] = [] + seen = set() + for v in selected: + if v not in seen: + seen.add(v) + unique.append(v) + return unique + + if config.target_variable not in (None, "", "None"): + return [str(config.target_variable)] + + return [] + +def _standardize_one_dataset( + path: Path, + config: NCBuildConfig, + time_lookup: Dict[str, Any], + level_lookup: Dict[str, Any], +) -> xr.Dataset: + open_kwargs = {"decode_times": True} + if config.use_dask_chunks: + if config.chunking_mode == "auto": + open_kwargs["chunks"] = "auto" + elif config.manual_chunks: + # Only keep positive chunks; xarray will ignore unknown dims poorly, + # so this is applied later after dims are known if needed. 
+ open_kwargs["chunks"] = {k: int(v) for k, v in config.manual_chunks.items() if int(v) > 0} + + try: + ds, engine_used = _open_dataset_auto( + path, + open_kwargs, + preferred_engine=config.open_engine, + ) + except Exception: + # Fallback without dask/chunks. + ds, engine_used = _open_dataset_auto( + path, + {"decode_times": True}, + preferred_engine=config.open_engine, + ) + + selected_vars = _selected_target_variables(config) + missing_vars = [ + var for var in selected_vars + if var not in ds.data_vars and var not in ds.variables + ] + + if missing_vars: + ds.close() + raise ValueError( + f"Target variable(s) {missing_vars!r} not found in {path.name}" + ) + + # Rename coordinates/dims to ECODATA/CF-style names. + ds = _rename_if_needed(ds, config.lat_variable, "lat") + ds = _rename_if_needed(ds, config.lon_variable, "lon") + + if config.time_source == "From NetCDF time coordinate": + ds = _rename_if_needed(ds, config.time_variable, "time") + + if config.level_source == "From NetCDF coordinate" and config.level_variable not in (None, "", "None"): + ds = _rename_if_needed(ds, config.level_variable, "level") + + # Build output dataset. + # Single-variable mode preserves the output_variable_name behaviour. + # Multi-variable mode keeps original variable names to avoid ambiguous renaming. + if len(selected_vars) == 1: + old_name = selected_vars[0] + new_name = config.output_variable_name or old_name + + da = ds[old_name] + if new_name != old_name: + da = da.rename(new_name) + + out = da.to_dataset() + else: + out = ds[selected_vars].copy() + + out.attrs["source_open_engine"] = engine_used + out.attrs["source_file"] = str(path) + + # Add time if it comes from filename/table and is not already a dimension. 
+ if config.time_source == "From filename": + t = _parse_from_filename(path, config.time_regex, time_format=config.time_format) + if "time" not in out.dims: + out = out.expand_dims(time=[pd.Timestamp(t)]) + else: + out = out.assign_coords(time=pd.to_datetime(out["time"].values)) + elif config.time_source == "Manual table": + value = _lookup_by_filename(path, time_lookup) + if value is None: + raise ValueError(f"No DateTime entry found in time table for {path.name}") + t = pd.to_datetime(value) + if "time" not in out.dims: + out = out.expand_dims(time=[pd.Timestamp(t)]) + elif "time" in out.coords: + out = _normalize_time_coord_if_possible(out) + + # Add level if it comes from filename/table and is not already a dimension. + if config.level_source == "From filename": + level_value = _parse_from_filename(path, config.level_regex, cast=float) + if "level" not in out.dims: + out = out.expand_dims(level=[level_value]) + elif config.level_source == "Manual table": + value = _lookup_by_filename(path, level_lookup) + if value is None: + raise ValueError(f"No level entry found in level table for {path.name}") + level_value = float(value) + if "level" not in out.dims: + out = out.expand_dims(level=[level_value]) + + # Keep only expected data + standard coords where possible. + if "lat" not in out.variables and "lat" not in out.coords: + raise ValueError(f"Could not standardize latitude coordinate in {path.name}") + if "lon" not in out.variables and "lon" not in out.coords: + raise ValueError(f"Could not standardize longitude coordinate in {path.name}") + + # Convert lon 0..360 to -180..180 when lon is 1D. + if config.convert_longitude_to_180 and "lon" in out.coords: + lon = out["lon"] + try: + if lon.ndim == 1 and float(lon.max()) > 180: + new_lon = ((lon + 180) % 360) - 180 + out = out.assign_coords(lon=new_lon).sortby("lon") + except Exception: + pass + + # Sort common dims. 
+ for dim in ("time", "level", "lat", "lon"): + if dim in out.coords: + try: + out = out.sortby(dim) + except Exception: + pass + + # Spatial subset by bbox. + if config.bbox: + out = _subset_bbox_1d_coords( + out, + config.bbox, + source_name=path.name, + ) + + # Time subset. + # In "By time" mode, the selected input files define the time range. + # This avoids unsafe pandas Timestamp slicing for cftime calendars. + if ( + config.combine_mode != "By time" + and "time" in out.coords + and (config.start_time or config.end_time) + ): + time_values = out["time"].values + first_time = time_values[0] if len(time_values) else None + + if first_time is not None and first_time.__class__.__module__.startswith("cftime"): + # Skip cftime slicing until a dedicated cftime-aware subset is implemented. + pass + else: + start = pd.to_datetime(config.start_time) if config.start_time else None + end = pd.to_datetime(config.end_time) if config.end_time else None + out = out.sel(time=slice(start, end)) + + return out + + +def _check_grid_compatibility(datasets: Sequence[xr.Dataset]) -> None: + if not datasets: + raise ValueError("No datasets to combine.") + + ref = datasets[0] + for coord in ("lat", "lon"): + if coord not in ref.coords: + continue + ref_vals = ref[coord].values + for i, ds in enumerate(datasets[1:], start=2): + if coord not in ds.coords: + raise ValueError(f"Dataset #{i} is missing coordinate {coord!r}") + vals = ds[coord].values + if ref_vals.shape != vals.shape or not np.allclose(ref_vals, vals, equal_nan=True): + raise ValueError( + f"Grid incompatibility for coordinate {coord!r}: " + f"dataset #1 shape {ref_vals.shape}, dataset #{i} shape {vals.shape}" + ) + + +def _combine_datasets(datasets: Sequence[xr.Dataset], config: NCBuildConfig) -> xr.Dataset: + _check_grid_compatibility(datasets) + + try: + combined = xr.combine_by_coords(list(datasets), combine_attrs="override") + except Exception: + # Fallback based on selected mode. 
+ if config.combine_mode == "By time": + combined = xr.concat(list(datasets), dim="time", combine_attrs="override") + elif config.combine_mode == "By level": + combined = xr.concat(list(datasets), dim="level", combine_attrs="override") + else: + # combine_by_coords is the safer option for time+level; + # if it failed, the layouts are probably ambiguous. + raise + + for dim in ("time", "level", "lat", "lon"): + if dim in combined.coords: + try: + combined = combined.sortby(dim) + except Exception: + pass + + return combined + + +def _apply_cf_metadata(ds: xr.Dataset, config: NCBuildConfig) -> xr.Dataset: + if "lat" in ds.coords: + ds["lat"].attrs.update({ + "standard_name": "latitude", + "long_name": "latitude", + "units": "degrees_north", + "axis": "Y", + }) + + if "lon" in ds.coords: + ds["lon"].attrs.update({ + "standard_name": "longitude", + "long_name": "longitude", + "units": "degrees_east", + "axis": "X", + }) + + if "time" in ds.coords: + ds["time"].attrs.update({ + "standard_name": "time", + "long_name": "time", + "axis": "T", + }) + + if "level" in ds.coords: + attrs = { + "long_name": "vertical level", + "axis": "Z", + "units": config.level_units, + } + if config.level_units in ("hPa", "Pa"): + attrs.update({ + "standard_name": "air_pressure", + "positive": "down", + }) + elif config.level_units == "m": + attrs.update({ + "standard_name": "height", + "positive": "up", + }) + ds["level"].attrs.update(attrs) + + ds.attrs.update({ + "title": "ECODATA standardized NetCDF", + "Conventions": "CF-1.8", + "history": f"Created by ECODATA NCBuilder", + "source_files_count": len(config.files), + "combine_mode": config.combine_mode, + }) + + return ds + +def _apply_time_encoding(ds: xr.Dataset, config: NCBuildConfig) -> xr.Dataset: + """ + Apply optional time encoding for ECODATA/MATLAB compatibility. + + This does not change the actual time coordinate values in memory. + It only controls how the time coordinate is written to the NetCDF file. 
+ """ + if not config.use_modis_time_encoding: + return ds + + if "time" not in ds.coords: + return ds + + ds["time"].encoding.update({ + "units": "days since 2000-01-01", + "calendar": "julian", + }) + + return ds + +def _encoding_for(ds: xr.Dataset, config: NCBuildConfig) -> Dict[str, Dict[str, Any]]: + encoding: Dict[str, Dict[str, Any]] = {} + + # Preserve explicit time encoding if it was set by _apply_time_encoding(). + if "time" in ds.coords and ds["time"].encoding: + time_encoding = {} + for key in ("units", "calendar", "dtype"): + if key in ds["time"].encoding: + time_encoding[key] = ds["time"].encoding[key] + if time_encoding: + encoding["time"] = time_encoding + + if not config.enable_compression: + return encoding + + for var in ds.data_vars: + encoding[var] = { + "zlib": True, + "complevel": 4, + } + + return encoding + + +def build_standardized_netcdf(config: NCBuildConfig) -> Dict[str, Any]: + ok, errors, warnings = validate_build_config(config) + if not ok: + raise ValueError("Invalid NCBuildConfig: " + "; ".join(errors)) + + output_path = Path(config.output_path).expanduser() + output_path.parent.mkdir(parents=True, exist_ok=True) + + time_lookup = _load_lookup_table(config.time_table_path, "DateTime") if config.time_source == "Manual table" else {} + level_lookup = _load_lookup_table(config.level_table_path, "level") if config.level_source == "Manual table" else {} + + datasets: List[xr.Dataset] = [] + processed_files: List[str] = [] + + for f in config.files: + path = Path(f).expanduser() + if not path.exists(): + continue + ds = _standardize_one_dataset(path, config, time_lookup, level_lookup) + datasets.append(ds) + processed_files.append(str(path)) + + if not datasets: + raise ValueError("No datasets were successfully opened.") + + combined = _combine_datasets(datasets, config) + combined = _apply_cf_metadata(combined, config) + combined = _apply_time_encoding(combined, config) + + encoding = _encoding_for(combined, config) + 
combined.to_netcdf(output_path, encoding=encoding) + + engine_by_file = {} + for ds in datasets: + source_file = ds.attrs.get("source_file") + source_engine = ds.attrs.get("source_open_engine") + if source_file and source_engine: + engine_by_file[Path(source_file).name] = source_engine + + # Close source datasets to release file handles. + for ds in datasets: + try: + ds.close() + except Exception: + pass + + manifest_path = output_path.with_suffix(output_path.suffix + ".manifest.json") + manifest = { + "output_path": str(output_path), + "manifest_path": str(manifest_path), + "processed_files": _json_safe(processed_files), + "engine_by_file": _json_safe(engine_by_file), + "config": _json_safe(asdict(config)), + "warnings": _json_safe(warnings), + "output_dims": _json_safe({k: int(v) for k, v in combined.sizes.items()}), + "output_variables": _json_safe(list(map(str, combined.data_vars))), + "output_coords": _json_safe(list(map(str, combined.coords))), + "time_encoding": _json_safe(dict(combined["time"].encoding)) if "time" in combined.coords else {}, + } + + with open(manifest_path, "w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2, ensure_ascii=False, default=str) + + try: + combined.close() + except Exception: + pass + + return manifest diff --git a/ecodata/presence_functions.py b/ecodata/presence_functions.py new file mode 100644 index 0000000..881ecc0 --- /dev/null +++ b/ecodata/presence_functions.py @@ -0,0 +1,934 @@ +""" +presence data preparation backend functions. 
+ +- VettingOptions +- AggregationOptions +- aggregate_ebird_to_files +- export_tracks_from_aggregated_counts +- read_species_from_agg_counts +""" + +from __future__ import annotations + +import datetime as dt +import gzip +import io +import json +import os +import tempfile +import zipfile +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence + +import numpy as np +import pandas as pd + +try: + import geopandas as gpd + from shapely.geometry import Point, box +except Exception: # pragma: no cover + gpd = None + Point = None + box = None + + +@dataclass +class VettingOptions: + """ + Vetting/filter options for eBird EBD + Sampling Event data. + + UI mapping: + - require_reviewed: filters by REVIEWED when present + - require_approved: filters by APPROVED when present + - require_all_species_reported: filters by ALL SPECIES REPORTED when present + - allowed_protocols: matches PROTOCOL TYPE (preferred) or PROTOCOL CODE if present + - exclude_incidental_historical: excludes Incidental/Historical when PROTOCOL TYPE present + - duration/distance bounds: applied when sampling effort fields are present + - require_valid_coords: removes rows with missing/invalid lat/lon + - clip_counts_above: clips numeric counts after parsing; 0 disables clipping + """ + + require_reviewed: bool = False + require_approved: bool = False + require_all_species_reported: bool = False + + allowed_protocols: Optional[List[str]] = None + exclude_incidental_historical: bool = True + + duration_min_minutes: int = 0 + duration_max_minutes: int = 600 + + distance_min_km: float = 0.0 + distance_max_km: float = 50.0 + + require_valid_coords: bool = True + + clip_counts_above: int = 0 + + +@dataclass +class AggregationOptions: + """ + Time aggregation options. + + Aggregation is performed in bins of N days starting from start_date. 
+ + Spatial aggregation: + - grid_step_deg == 0: keep original observation coordinates + - grid_step_deg > 0: assign observations to regular lon/lat grid nodes + + Notes: + - treat_x_as_one: if True, OBSERVATION COUNT == 'X' is treated as 1. + If False, 'X' is treated as missing and then filled to 1.0 for presence-like behavior. + """ + + start_date: dt.date + end_date: dt.date + step_days: int = 7 + grid_step_deg: float = 0.0 + treat_x_as_one: bool = True + + +def _truthy(series: pd.Series) -> pd.Series: + """ + Interpret typical eBird truthy values. + """ + s = series.fillna("").astype(str).str.strip().str.upper() + return s.isin(["1", "TRUE", "T", "YES", "Y"]) + + +def _read_bytes_table(file_bytes: bytes) -> pd.DataFrame: + """ + Read EBD/Sampling tables from bytes. + + Supports: + - TSV (tab-separated) plain + - gzip-compressed TSV + - zip containing a TSV/TXT/CSV + + Drops any 'Unnamed:*' columns. + """ + if not file_bytes: + raise ValueError("Empty file bytes.") + + # ZIP container + if zipfile.is_zipfile(io.BytesIO(file_bytes)): + with tempfile.TemporaryDirectory() as td: + zp = os.path.join(td, "f.zip") + with open(zp, "wb") as f: + f.write(file_bytes) + + with zipfile.ZipFile(zp, "r") as zf: + names = zf.namelist() + cand = [n for n in names if n.lower().endswith((".txt", ".tsv", ".csv"))] + if not cand: + raise ValueError("ZIP does not contain a .txt/.tsv/.csv table.") + target = cand[0] + with zf.open(target) as zfh: + raw = zfh.read() + return _read_bytes_table(raw) + + # GZIP container + if file_bytes[:2] == b"\x1f\x8b": + try: + raw = gzip.decompress(file_bytes) + except Exception as e: + raise ValueError(f"Failed to decompress gzip content: {e}") from e + return _read_bytes_table(raw) + + # Plain text table: try TSV then CSV + bio = io.BytesIO(file_bytes) + try: + df = pd.read_csv(bio, sep="\t", dtype=str, low_memory=False) + except Exception: + bio.seek(0) + df = pd.read_csv(bio, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, 
~df.columns.astype(str).str.startswith("Unnamed:")] + return df + +def _read_path_table(path: str) -> pd.DataFrame: + """ + Read EBD/Sampling tables from a local filesystem path. + + Supports: + - plain TSV/CSV + - .gz + - .zip containing a .txt/.tsv/.csv table + + Drops any 'Unnamed:*' columns. + """ + if not path: + raise ValueError("Empty file path.") + if not os.path.exists(path): + raise ValueError(f"File does not exist: {path}") + + lower = path.lower() + + # ZIP container + if lower.endswith(".zip"): + with zipfile.ZipFile(path, "r") as zf: + names = zf.namelist() + cand = [n for n in names if n.lower().endswith((".txt", ".tsv", ".csv"))] + if not cand: + raise ValueError("ZIP does not contain a .txt/.tsv/.csv table.") + target = cand[0] + with zf.open(target) as zfh: + try: + df = pd.read_csv(zfh, sep="\t", dtype=str, low_memory=False) + except Exception: + zfh.close() + with zf.open(target) as zfh2: + df = pd.read_csv(zfh2, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + # GZIP container + if lower.endswith(".gz"): + try: + df = pd.read_csv(path, sep="\t", dtype=str, low_memory=False, compression="gzip") + except Exception: + df = pd.read_csv(path, sep=",", dtype=str, low_memory=False, compression="gzip") + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + # Plain text table + try: + df = pd.read_csv(path, sep="\t", dtype=str, low_memory=False) + except Exception: + df = pd.read_csv(path, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + +def _read_table_input(table_input: Any) -> pd.DataFrame: + """ + Read table either from bytes (old FileInput workflow) or from local path. 
+ """ + if isinstance(table_input, (str, os.PathLike)): + return _read_path_table(os.fspath(table_input)) + return _read_bytes_table(table_input) + +def _ensure_cols(df: pd.DataFrame, cols: Sequence[str], label: str) -> None: + missing = [c for c in cols if c not in df.columns] + if missing: + raise ValueError(f"Missing required columns in {label}: {missing}") + + +def _load_polygon(polygon_source: Any, filename_hint: str) -> "gpd.GeoDataFrame": + """ + Load polygon from a local path or bytes. Supports: + - zipped shapefile (.zip) + - GeoJSON / JSON + Returns GeoDataFrame in EPSG:4326. + """ + if gpd is None: + raise ImportError("geopandas is required for polygon operations.") + + if isinstance(polygon_source, (str, os.PathLike)): + path = os.fspath(polygon_source) + if not os.path.exists(path): + raise ValueError(f"Polygon file does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"Polygon path is not a file: {path}") + + lower = path.lower() + if lower.endswith(".zip"): + with tempfile.TemporaryDirectory() as td: + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(td) + + shp = None + for root, _dirs, files in os.walk(td): + for fn in files: + if fn.lower().endswith(".shp"): + shp = os.path.join(root, fn) + break + if shp: + break + if not shp: + raise ValueError("Polygon ZIP does not contain a .shp file.") + poly = gpd.read_file(shp) + else: + poly = gpd.read_file(path) + else: + polygon_bytes = polygon_source + with tempfile.TemporaryDirectory() as td: + if (filename_hint or "").lower().endswith(".zip") or zipfile.is_zipfile(io.BytesIO(polygon_bytes)): + zp = os.path.join(td, "poly.zip") + with open(zp, "wb") as f: + f.write(polygon_bytes) + with zipfile.ZipFile(zp, "r") as zf: + zf.extractall(td) + + shp = None + for root, _dirs, files in os.walk(td): + for fn in files: + if fn.lower().endswith(".shp"): + shp = os.path.join(root, fn) + break + if shp: + break + if not shp: + raise ValueError("Polygon ZIP does not contain a .shp 
file.") + poly = gpd.read_file(shp) + else: + fp = os.path.join(td, "poly.geojson") + with open(fp, "wb") as f: + f.write(polygon_bytes) + poly = gpd.read_file(fp) + + if poly.empty: + raise ValueError("Polygon contains no features.") + + if poly.crs is None: + poly = poly.set_crs("EPSG:4326") + else: + poly = poly.to_crs("EPSG:4326") + + return poly + +def _load_bbox_polygon(bbox: Sequence[float]) -> "gpd.GeoDataFrame": + """ + Build polygon GeoDataFrame from bbox: + (west, south, east, north) in EPSG:4326. + """ + if gpd is None or box is None: + raise ImportError("geopandas + shapely are required for bbox operations.") + + if bbox is None or len(bbox) != 4: + raise ValueError("BBox must contain exactly 4 values: west, south, east, north.") + + west, south, east, north = [float(v) for v in bbox] + + if not (-180 <= west <= 180 and -180 <= east <= 180): + raise ValueError("Invalid bbox: longitude must be between -180 and 180.") + if not (-90 <= south <= 90 and -90 <= north <= 90): + raise ValueError("Invalid bbox: latitude must be between -90 and 90.") + if west >= east: + raise ValueError("Invalid bbox: west must be smaller than east.") + if south >= north: + raise ValueError("Invalid bbox: south must be smaller than north.") + + geom = box(west, south, east, north) + return gpd.GeoDataFrame({"name": ["bbox_region"]}, geometry=[geom], crs="EPSG:4326") + + +def _resolve_spatial_filter( + polygon_bytes: Optional[Any] = None, + polygon_filename_hint: Optional[str] = None, + bbox: Optional[Sequence[float]] = None, +) -> "gpd.GeoDataFrame": + """ + Resolve spatial filter source into a single polygon GeoDataFrame in EPSG:4326. + + Exactly one of: + - polygon_bytes/path + - bbox + must be provided. 
+ """ + has_polygon = bool(polygon_bytes) + has_bbox = bbox is not None + + if has_polygon and has_bbox: + raise ValueError("Provide either polygon_bytes or bbox, not both.") + if not has_polygon and not has_bbox: + raise ValueError("Provide either polygon_bytes or bbox.") + + if has_polygon: + return _load_polygon(polygon_bytes, polygon_filename_hint or "") + return _load_bbox_polygon(bbox) + + + +def _parse_obs_datetime(df: pd.DataFrame) -> pd.Series: + """ + Parse timestamp from OBSERVATION DATE and TIME OBSERVATIONS STARTED. + """ + d_str = df["OBSERVATION DATE"].fillna("").astype(str).str.strip() + t_str = df.get("TIME OBSERVATIONS STARTED", pd.Series([""] * len(df))).fillna("").astype(str).str.strip() + dt_full = np.where(t_str != "", d_str + " " + t_str, d_str) + return pd.to_datetime(dt_full, errors="coerce") + + +def _parse_counts(df: pd.DataFrame, treat_x_as_one: bool) -> pd.Series: + """ + Parse OBSERVATION COUNT as numeric; supports 'X' for unknown counts. + """ + raw = df["OBSERVATION COUNT"].fillna("").astype(str).str.strip().str.upper() + if treat_x_as_one: + raw = raw.replace({"X": "1"}) + num = pd.to_numeric(raw, errors="coerce") + return num + + +def _normalize_protocol_values(values: Optional[List[str]]) -> Optional[set]: + """ + Normalize protocol values for matching. + """ + if not values: + return None + return {str(v).strip() for v in values if str(v).strip()} + + +def _apply_vetting(m: pd.DataFrame, vet: VettingOptions) -> pd.DataFrame: + """ + Apply vetting filters to merged observations+sampling dataframe. 
+ """ + out = m.copy() + + # REVIEWED / APPROVED / ALL SPECIES REPORTED (AND logic if multiple are True) + if vet.require_reviewed and "REVIEWED" in out.columns: + out = out[_truthy(out["REVIEWED"])] + + if vet.require_approved and "APPROVED" in out.columns: + out = out[_truthy(out["APPROVED"])] + + if vet.require_all_species_reported and "ALL SPECIES REPORTED" in out.columns: + out = out[_truthy(out["ALL SPECIES REPORTED"])] + + # Protocol filtering (optional) + allowed = _normalize_protocol_values(vet.allowed_protocols) + + if allowed: + allowed_norm = {str(a).strip() for a in allowed if str(a).strip()} + + # 1) PROTOCOL TYPE (preferred) + if "PROTOCOL TYPE" in out.columns: + out = out[out["PROTOCOL TYPE"].astype(str).str.strip().isin(allowed_norm)] + + # 2) PROTOCOL NAME (common in sampling file) + elif "PROTOCOL NAME" in out.columns: + out = out[out["PROTOCOL NAME"].astype(str).str.strip().isin(allowed_norm)] + + # 3) OBSERVATION TYPE (common in EBD) + elif "OBSERVATION TYPE" in out.columns: + out = out[out["OBSERVATION TYPE"].astype(str).str.strip().isin(allowed_norm)] + + # 4) Fallback to PROTOCOL CODE only if UI supplies codes + elif "PROTOCOL CODE" in out.columns: + allowed_u = {a.upper() for a in allowed_norm} + out = out[out["PROTOCOL CODE"].astype(str).str.strip().str.upper().isin(allowed_u)] + + + # Exclude incidental/historical (optional) + if vet.exclude_incidental_historical and "PROTOCOL TYPE" in out.columns: + bad = {"Incidental", "Historical"} + out = out[~out["PROTOCOL TYPE"].astype(str).str.strip().isin(bad)] + + # Duration bounds (optional) + if "DURATION MINUTES" in out.columns: + dur = pd.to_numeric(out["DURATION MINUTES"], errors="coerce") + out = out[(dur.isna()) | ((dur >= vet.duration_min_minutes) & (dur <= vet.duration_max_minutes))] + + # Distance bounds (optional) + if "EFFORT DISTANCE KM" in out.columns: + dist = pd.to_numeric(out["EFFORT DISTANCE KM"], errors="coerce") + out = out[(dist.isna()) | ((dist >= vet.distance_min_km) & 
(dist <= vet.distance_max_km))] + + # Coordinates + if vet.require_valid_coords: + out = out[out["latitude"].notna() & out["longitude"].notna()] + + # Require valid timestamp + out = out[out["__dt"].notna()] + return out + +def _write_manifest(path: str, payload: Dict[str, Any]) -> None: + """ + Write JSON manifest to disk. + """ + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + +def _assign_n_day_bins( + datetimes: pd.Series, + start_date: dt.date, + end_date: dt.date, + step_days: int, +) -> pd.DataFrame: + """ + Assign each datetime to an N-day bin starting from start_date. + + Returns a DataFrame with: + - time_bin_start + - time_bin_end + + Both are strings in YYYY-MM-DD format. + """ + if step_days < 1: + raise ValueError("step_days must be >= 1.") + + ts = pd.to_datetime(datetimes, errors="coerce") + start_ts = pd.Timestamp(start_date) + end_ts = pd.Timestamp(end_date) + + day_offsets = (ts.dt.normalize() - start_ts).dt.days + bin_index = (day_offsets // step_days).astype("Int64") + + bin_start = start_ts + pd.to_timedelta(bin_index * step_days, unit="D") + bin_end = bin_start + pd.to_timedelta(step_days - 1, unit="D") + bin_end = bin_end.where(bin_end <= end_ts, end_ts) + + return pd.DataFrame( + { + "time_bin_start": bin_start.dt.strftime("%Y-%m-%d"), + "time_bin_end": bin_end.dt.strftime("%Y-%m-%d"), + }, + index=datetimes.index, + ) + +def _assign_grid_nodes( + df: pd.DataFrame, + grid_step_deg: float, + origin_west: float, + origin_south: float, +) -> pd.DataFrame: + """ + Assign observations to regular lon/lat grid nodes. + + Grid nodes are anchored at (origin_west, origin_south) and repeated every + grid_step_deg degrees. 
+ + Each observation is assigned to exactly one nearest node, equivalent to + belonging to the square cell: + - lon_node ± 0.5 * grid_step_deg + - lat_node ± 0.5 * grid_step_deg + + Returns a copy of df with: + - grid_lon + - grid_lat + """ + if grid_step_deg <= 0: + raise ValueError("grid_step_deg must be > 0 for grid assignment.") + + out = df.copy() + + lon_offset = (out["longitude"] - origin_west) / grid_step_deg + lat_offset = (out["latitude"] - origin_south) / grid_step_deg + + out["grid_lon"] = origin_west + np.round(lon_offset) * grid_step_deg + out["grid_lat"] = origin_south + np.round(lat_offset) * grid_step_deg + + out["grid_lon"] = out["grid_lon"].astype(float) + out["grid_lat"] = out["grid_lat"].astype(float) + + return out + +def _safe_divide(num: pd.Series, den: pd.Series) -> pd.Series: + """ + Safe division returning NaN when denominator is zero or missing. + """ + n = pd.to_numeric(num, errors="coerce") + d = pd.to_numeric(den, errors="coerce") + return n / d.where(d > 0) + +def aggregate_ebird_to_files( + *, + ebd_bytes: Any, + sampling_bytes: Any, + polygon_bytes: Optional[bytes] = None, + polygon_filename_hint: str = "", + bbox: Optional[Sequence[float]] = None, + ebd_filename_hint: str = "", + sampling_filename_hint: str = "", + region_id: str, + agg: AggregationOptions, + vet: VettingOptions, + out_counts_csv: str, + out_presence_csv: str, + manifest_json: Optional[str] = None, +) -> List[str]: + """ + Read EBD + Sampling Event data, apply vetting and spatial filters, and aggregate in N-day bins and by species. 
+ Outputs: + - counts CSV (A): time_bin_start, time_bin_end, location-lat, location-long, species, + total_count, n_checklists, n_checklists_all, n_complete_checklists, + n_detected_complete_checklists, sum_duration_hours_complete, + sum_party_hours_complete, reporting_rate, count_per_complete_checklist, + count_per_hour, count_per_party_hour_complete, + mean_count_when_detected, region_id + - presence CSV (B): time_bin_start, time_bin_end, location-lat, location-long, species, + presence, n_checklists, n_checklists_all, n_complete_checklists, + n_detected_complete_checklists, reporting_rate, region_id + Returns: + - sorted list of unique species found in the counts output. + """ + if gpd is None or Point is None: + raise ImportError("geopandas + shapely are required for polygon operations.") + if agg.step_days < 1: + raise ValueError("Aggregation step_days must be >= 1.") + if agg.grid_step_deg < 0: + raise ValueError("Aggregation grid_step_deg must be >= 0.") + + obs = _read_table_input(ebd_bytes) + samp = _read_table_input(sampling_bytes) + + _ensure_cols( + obs, + [ + "SAMPLING EVENT IDENTIFIER", + "LATITUDE", + "LONGITUDE", + "OBSERVATION DATE", + "SCIENTIFIC NAME", + "COMMON NAME", + "OBSERVATION COUNT", + ], + "EBD observations", + ) + _ensure_cols(samp, ["SAMPLING EVENT IDENTIFIER"], "Sampling events") + + key = "SAMPLING EVENT IDENTIFIER" + merged = obs.merge( + samp.drop_duplicates(subset=[key]), + on=key, + how="left", + suffixes=("", "_samp"), + ) + + merged["__dt"] = _parse_obs_datetime(merged) + merged["latitude"] = pd.to_numeric(merged["LATITUDE"], errors="coerce") + merged["longitude"] = pd.to_numeric(merged["LONGITUDE"], errors="coerce") + + m = _apply_vetting(merged, vet) + + poly = _resolve_spatial_filter( + polygon_bytes=polygon_bytes, + polygon_filename_hint=polygon_filename_hint, + bbox=bbox, + ) + poly_union = poly.dissolve().geometry.iloc[0] + + minx, miny, maxx, maxy = poly.total_bounds + if bbox is not None: + origin_west, origin_south = 
float(bbox[0]), float(bbox[1]) + else: + origin_west, origin_south = float(minx), float(miny) + + gdf = gpd.GeoDataFrame( + m, + geometry=[Point(xy) for xy in zip(m["longitude"], m["latitude"])], + crs="EPSG:4326", + ) + gdf = gdf[gdf.intersects(poly_union)].drop(columns=["geometry"]) + m = pd.DataFrame(gdf) + + # 2) Start/End date limits (inclusive) + start_dt = pd.Timestamp(agg.start_date) + end_dt = pd.Timestamp(agg.end_date) + pd.Timedelta(days=1) - pd.Timedelta(seconds=1) + m = m[(m["__dt"] >= start_dt) & (m["__dt"] <= end_dt)] + + # Parse counts + cnt = _parse_counts(m, treat_x_as_one=agg.treat_x_as_one) + # If count is missing (including treat_x_as_one=False), default to 1.0 for presence-like behavior + m["__count"] = cnt.fillna(1.0) + + # Optional clip + if vet.clip_counts_above and vet.clip_counts_above > 0: + m["__count"] = m["__count"].clip(upper=vet.clip_counts_above) + + # Species label for aggregation + m["species"] = m["SCIENTIFIC NAME"].fillna(m["COMMON NAME"]).astype(str) + + # Time binning: fixed-size bins in N days, anchored at agg.start_date + bins = _assign_n_day_bins( + m["__dt"], + start_date=agg.start_date, + end_date=agg.end_date, + step_days=agg.step_days, + ) + m["time_bin_start"] = bins["time_bin_start"] + m["time_bin_end"] = bins["time_bin_end"] + + # Spatial aggregation: + # - grid_step_deg == 0: keep original observation coordinates + # - grid_step_deg > 0: assign to regular grid nodes and aggregate by node + if agg.grid_step_deg > 0: + m = _assign_grid_nodes( + m, + grid_step_deg=agg.grid_step_deg, + origin_west=origin_west, + origin_south=origin_south, + ) + loc_lat_col = "grid_lat" + loc_lon_col = "grid_lon" + else: + loc_lat_col = "latitude" + loc_lon_col = "longitude" + + # Common grouping keys + spatial_time_keys = ["time_bin_start", "time_bin_end", loc_lat_col, loc_lon_col] + species_keys = spatial_time_keys + ["species"] + + # Complete checklist flag + if "ALL SPECIES REPORTED" in m.columns: + m["__complete_checklist"] = 
_truthy(m["ALL SPECIES REPORTED"]) + else: + m["__complete_checklist"] = False + + # Duration in hours + if "DURATION MINUTES" in m.columns: + m["__duration_hours"] = pd.to_numeric(m["DURATION MINUTES"], errors="coerce") / 60.0 + else: + m["__duration_hours"] = np.nan + # Number of observers + if "NUMBER OBSERVERS" in m.columns: + m["__n_observers"] = pd.to_numeric(m["NUMBER OBSERVERS"], errors="coerce") + else: + m["__n_observers"] = np.nan + # ------------------------------------------------------------------ + # 1) Denominator table from unique checklists at time_bin + spatial unit + # ------------------------------------------------------------------ + checklist_cols = [ + key, + "time_bin_start", + "time_bin_end", + loc_lat_col, + loc_lon_col, + "__complete_checklist", + "__duration_hours", + "__n_observers", + ] + checklist_frame = m[checklist_cols].drop_duplicates(subset=[key]) + + checklist_frame["__duration_hours_complete_only"] = checklist_frame["__duration_hours"].where( + checklist_frame["__complete_checklist"], + np.nan, + ) + + checklist_frame["__party_hours"] = checklist_frame["__duration_hours"] * checklist_frame["__n_observers"] + checklist_frame["__party_hours_complete_only"] = checklist_frame["__party_hours"].where( + checklist_frame["__complete_checklist"], + np.nan, + ) + + denom = ( + checklist_frame + .groupby(spatial_time_keys, dropna=False) + .agg( + n_checklists_all=(key, pd.Series.nunique), + n_complete_checklists=("__complete_checklist", "sum"), + sum_duration_hours_complete=("__duration_hours_complete_only", "sum"), + sum_party_hours_complete=("__party_hours_complete_only", "sum"), + ) + .reset_index() + ) + + # ------------------------------------------------------------------ + # 2) Species table from detections + # ------------------------------------------------------------------ + grp = m.groupby(species_keys, dropna=False) + + counts = grp.agg( + total_count=("__count", "sum"), + n_checklists=(key, pd.Series.nunique), + 
).reset_index() + + pres = grp.agg( + presence=("__count", lambda x: 1), + n_checklists=(key, pd.Series.nunique), + ).reset_index() + + # ------------------------------------------------------------------ + # 3) Species table from detections on complete checklists only + # ------------------------------------------------------------------ + detected_complete = m[m["__complete_checklist"]].copy() + + if len(detected_complete) > 0: + grp_complete = detected_complete.groupby(species_keys, dropna=False) + det_complete = grp_complete.agg( + n_detected_complete_checklists=(key, pd.Series.nunique), + ).reset_index() + else: + det_complete = pd.DataFrame(columns=species_keys + ["n_detected_complete_checklists"]) + + # ------------------------------------------------------------------ + # 4) Join denominator + derived metrics + # ------------------------------------------------------------------ + counts = counts.merge(denom, on=spatial_time_keys, how="left") + counts = counts.merge(det_complete, on=species_keys, how="left") + counts["n_detected_complete_checklists"] = counts["n_detected_complete_checklists"].fillna(0) + + counts["reporting_rate"] = _safe_divide( + counts["n_detected_complete_checklists"], + counts["n_complete_checklists"], + ) + counts["count_per_complete_checklist"] = _safe_divide( + counts["total_count"], + counts["n_complete_checklists"], + ) + counts["count_per_hour"] = _safe_divide( + counts["total_count"], + counts["sum_duration_hours_complete"], + ) + counts["count_per_party_hour_complete"] = _safe_divide( + counts["total_count"], + counts["sum_party_hours_complete"], + ) + counts["mean_count_when_detected"] = _safe_divide( + counts["total_count"], + counts["n_checklists"], + ) + + counts["region_id"] = region_id + counts = counts.rename(columns={loc_lat_col: "location-lat", loc_lon_col: "location-long"}) + + pres = pres.merge(denom, on=spatial_time_keys, how="left") + pres = pres.merge(det_complete, on=species_keys, how="left") + 
pres["n_detected_complete_checklists"] = pres["n_detected_complete_checklists"].fillna(0) + + pres["reporting_rate"] = _safe_divide( + pres["n_detected_complete_checklists"], + pres["n_complete_checklists"], + ) + pres["count_per_complete_checklist"] = np.nan + pres["count_per_hour"] = np.nan + pres["count_per_party_hour_complete"] = np.nan + pres["mean_count_when_detected"] = np.nan + + pres["region_id"] = region_id + pres = pres.rename(columns={loc_lat_col: "location-lat", loc_lon_col: "location-long"}) + + os.makedirs(os.path.dirname(os.path.abspath(out_counts_csv)), exist_ok=True) + counts.to_csv(out_counts_csv, index=False, encoding="utf-8") + + os.makedirs(os.path.dirname(os.path.abspath(out_presence_csv)), exist_ok=True) + pres.to_csv(out_presence_csv, index=False, encoding="utf-8") + + if manifest_json: + if bbox is not None: + west, south, east, north = [float(v) for v in bbox] + spatial_filter = { + "type": "bbox", + "west": west, + "south": south, + "east": east, + "north": north, + } + else: + spatial_filter = { + "type": "polygon", + "filename_hint": polygon_filename_hint or "", + } + + payload: Dict[str, Any] = { + "created_at": dt.datetime.now().isoformat(), + "region_id": region_id, + "source_mode": "EBD + Sampling Event", + "spatial_filter": spatial_filter, + "time": { + "start": str(agg.start_date), + "end": str(agg.end_date), + "step_days": int(agg.step_days), + }, + "grid": { + "grid_step_deg": float(agg.grid_step_deg), + "origin_west": float(origin_west), + "origin_south": float(origin_south), + "mode": "grid" if agg.grid_step_deg > 0 else "original_coordinates", + }, + "derived_metrics": [ + "reporting_rate", + "count_per_complete_checklist", + "n_complete_checklists", + "count_per_hour", + "count_per_party_hour_complete", + "mean_count_when_detected", + ], + "vetting": vet.__dict__, + "outputs": { + "agg_counts_csv": out_counts_csv, + "agg_presence_csv": out_presence_csv, + }, + } + _write_manifest(manifest_json, payload) + + # 3) Species 
list for UI + species_list = sorted(counts["species"].dropna().astype(str).unique().tolist()) + return species_list
+
+
+def read_species_from_agg_counts(agg_counts_csv: str) -> List[str]:
+    """
+    Read the unique species list from an aggregated counts CSV.
+
+    Only the ``species`` column is loaded. Missing values are dropped, the
+    remaining values are converted to strings, de-duplicated and sorted.
+
+    Returns an empty list when ``agg_counts_csv`` does not exist.
+    """
+    if not os.path.exists(agg_counts_csv):
+        return []
+    df = pd.read_csv(agg_counts_csv, usecols=["species"])
+    return sorted(df["species"].dropna().astype(str).unique().tolist())
+
+
+def export_tracks_from_aggregated_counts(
+    *,
+    agg_counts_csv: str,
+    tracks_csv: str,
+    region_id: str,
+    id_mode: str = "species",
+    species_filter: Optional[List[str]] = None,
+) -> None:
+    """
+    Convert aggregated counts file into Movebank-like pseudo-tracks CSV for ECODATA-Animate.
+
+    If species_filter is provided and non-empty (after stripping whitespace),
+    export only those species.
+
+    id_mode:
+      - "species|region": identifier is "<species>|region:<region_id>"
+      - any other value:  identifier is the species name alone
+
+    Output columns:
+      - timestamp                    (time_bin_start as %Y-%m-%dT%H:%M:%S; missing if unparseable)
+      - location-long
+      - location-lat
+      - individual-local-identifier
+      - species
+      - count                        (total_count if that column exists, otherwise 1)
+      - bin_id                       (time_bin_start as a string)
+      - region_id
+      - total_count, n_checklists, n_checklists_all, n_complete_checklists,
+        n_detected_complete_checklists, sum_duration_hours_complete,
+        sum_party_hours_complete, reporting_rate, count_per_complete_checklist,
+        count_per_hour, count_per_party_hour_complete, mean_count_when_detected
+        (copied from the input when present; left empty when the input lacks them)
+
+    Raises FileNotFoundError when agg_counts_csv does not exist.
+    """
+    if not os.path.exists(agg_counts_csv):
+        raise FileNotFoundError(f"Aggregated counts file not found: {agg_counts_csv}")
+
+    df = pd.read_csv(agg_counts_csv)
+
+    if species_filter:
+        keep = {str(s).strip() for s in species_filter if str(s).strip()}
+        if keep:
+            # Take an explicit copy: columns are assigned onto df below, and
+            # assigning onto a boolean-filtered frame triggers pandas'
+            # SettingWithCopyWarning.
+            df = df[df["species"].astype(str).isin(keep)].copy()
+
+    ts = pd.to_datetime(df["time_bin_start"], errors="coerce")
+    df["timestamp"] = ts.dt.strftime("%Y-%m-%dT%H:%M:%S")
+    df["bin_id"] = df["time_bin_start"].astype(str)
+
+    if id_mode == "species|region":
+        df["individual-local-identifier"] = df["species"].astype(str) + "|region:" + str(region_id)
+    else:
+        df["individual-local-identifier"] = df["species"].astype(str)
+
+    # Metric columns passed through unchanged, in output order. df.get()
+    # returns None for a column the input lacks, which yields an empty column.
+    passthrough_cols = (
+        "total_count",
+        "n_checklists",
+        "n_checklists_all",
+        "n_complete_checklists",
+        "n_detected_complete_checklists",
+        "sum_duration_hours_complete",
+        "sum_party_hours_complete",
+        "reporting_rate",
+        "count_per_complete_checklist",
+        "count_per_hour",
+        "count_per_party_hour_complete",
+        "mean_count_when_detected",
+    )
+
+    columns = {
+        "timestamp": df["timestamp"],
+        "location-long": df["location-long"],
+        "location-lat": df["location-lat"],
+        "individual-local-identifier": df["individual-local-identifier"],
+        "species": df["species"],
+        "count": df.get("total_count", 1),
+        "bin_id": df["bin_id"],
+        "region_id": region_id,
+    }
+    for col in passthrough_cols:
+        columns[col] = df.get(col)
+
+    out = pd.DataFrame(columns)
+
+    os.makedirs(os.path.dirname(os.path.abspath(tracks_csv)), exist_ok=True)
+    out.to_csv(tracks_csv, index=False, encoding="utf-8")
diff --git a/ecodata/xr_tools.py b/ecodata/xr_tools.py index 6051013..0e12ef1 100644 --- a/ecodata/xr_tools.py +++ b/ecodata/xr_tools.py @@ -458,6 +458,6 @@ def set_time_encoding_modis(ds): ds : xarray.Dataset Dataset for which the encodings will be modified. This function will modify the encoding format in place.
""" - modis_encoding = {'units': 'days since 2000-01-01', 'calendar': 'julian'} + modis_encoding = {'units': 'days since 2000-01-01', 'calendar': 'julian', "dtype": "float64"} for key in modis_encoding: ds.time.encoding[key] = modis_encoding[key] \ No newline at end of file diff --git a/scripts/build_dev.ps1 b/scripts/build_dev.ps1 index f5b2f26..0701b01 100644 --- a/scripts/build_dev.ps1 +++ b/scripts/build_dev.ps1 @@ -1,8 +1,11 @@ $env:ECODATA_VERSION = python print_ecodata_ver.py +if ($args.Count -gt 0) { + $env:ECODATA_INSTALL_BRANCH = $args[0] +} Write-Host "Building with:" Write-Host " ECODATA_VERSION = $env:ECODATA_VERSION" Write-Host " ECODATA_INSTALL_BRANCH = $env:ECODATA_INSTALL_BRANCH" Write-Host " REPO = $env:REPO" -constructor --config-filename construct_dev.yaml \ No newline at end of file +constructor --config-filename construct_dev.yaml diff --git a/scripts/build_dev.sh b/scripts/build_dev.sh index cbd5860..9cba4a4 100644 --- a/scripts/build_dev.sh +++ b/scripts/build_dev.sh @@ -1,9 +1,12 @@ ecodata_ver=`python print_ecodata_ver.py` export ECODATA_VERSION=${ecodata_ver} +if [ -n "$1" ]; then + export ECODATA_INSTALL_BRANCH="$1" +fi echo "Building with:" echo " ECODATA_VERSION = $ECODATA_VERSION" echo " ECODATA_INSTALL_BRANCH = $ECODATA_INSTALL_BRANCH" echo " REPO = $REPO" -constructor --config-filename construct_dev.yaml \ No newline at end of file +constructor --config-filename construct_dev.yaml diff --git a/scripts/construct_dev.yaml b/scripts/construct_dev.yaml index 1afd6b4..ad0f108 100644 --- a/scripts/construct_dev.yaml +++ b/scripts/construct_dev.yaml @@ -29,8 +29,14 @@ post_install_desc: Install ecodata development version specs: - python >=3.11 + - pip + - git - conda >=23.10 - conda-libmamba-solver - ecodata-menu={{ menu_version }} + - distributed + - geographiclib + - h5netcdf + - pyproj menu_packages: - ecodata-menu diff --git a/scripts/post_install.bat b/scripts/post_install.bat index 2457f0b..74beaa5 100644 --- 
a/scripts/post_install.bat
+++ b/scripts/post_install.bat
@@ -1,2 +1,8 @@
 call "%PREFIX%\Scripts\activate.bat"
-python -m pip install "git+https://github.com/%REPO%@%ECODATA_INSTALL_BRANCH%"
+rem Install ecodata from the requested git ref. --no-deps: dependencies are expected to come from the installer's conda specs rather than from pip.
+python -m pip install --no-cache-dir --upgrade --force-reinstall --no-deps "git+https://github.com/%REPO%@%ECODATA_INSTALL_BRANCH%"
+if errorlevel 1 exit /b %errorlevel%
+
+rem Smoke test: import the package, print its version/location and the registered apps, and fail if no apps are registered.
+python -c "import ecodata; from ecodata.app.apps import applications; print('Installed ecodata:', getattr(ecodata, '__version__', 'unknown'), ecodata.__file__); print('Registered apps:', ', '.join(sorted(applications))); assert applications"
+if errorlevel 1 exit /b %errorlevel%
diff --git a/scripts/post_install.sh b/scripts/post_install.sh
index 54792b8..dd34d6b 100644
--- a/scripts/post_install.sh
+++ b/scripts/post_install.sh
@@ -1,2 +1,15 @@
-. "$PREFIX/etc/profile.d/conda.sh" && conda activate "$PREFIX"
-python -m pip install git+https://github.com/${REPO}@${ECODATA_INSTALL_BRANCH}
+set -e
+
+# Activate the installed environment so "python" below resolves inside $PREFIX.
+# These are two separate commands on purpose: with "set -e", a failure on the
+# left-hand side of "&&" does not abort the script, so a failed source would
+# silently fall through to pip running in whatever environment is on PATH.
+. "$PREFIX/etc/profile.d/conda.sh"
+conda activate "$PREFIX"
+
+# Install ecodata from the requested git ref. --no-deps: dependencies are
+# expected to come from the installer's conda specs rather than from pip.
+python -m pip install --no-cache-dir --upgrade --force-reinstall --no-deps "git+https://github.com/${REPO}@${ECODATA_INSTALL_BRANCH}"
+
+# Smoke test: import the package, print its version/location and the registered apps, and fail if no apps are registered.
+python -c "import ecodata; from ecodata.app.apps import applications; print('Installed ecodata:', getattr(ecodata, '__version__', 'unknown'), ecodata.__file__); print('Registered apps:', ', '.join(sorted(applications))); assert applications"