def safe_open_nc_with_time_decoding(path):
    """
    Open a NetCDF file and decode its time axis manually.

    The file is opened with ``decode_times=False`` and the time values are
    decoded explicitly so that non-standard calendars (julian, gregorian,
    360_day, noleap, ...) are supported. The time coordinate is always
    returned under the name ``"time"`` as a ``pd.DatetimeIndex``, even when
    the decoder produced cftime objects.

    Parameters
    ----------
    path : str or path-like
        Location of the NetCDF file.

    Returns
    -------
    xarray.Dataset
        Dataset whose time coordinate is named ``"time"``.

    Raises
    ------
    RuntimeError
        If the file cannot be opened or no time axis can be found/decoded.
        The original exception is attached as ``__cause__``.
    """
    opened = None
    try:
        opened = xr.open_dataset(path, decode_times=False)
        ds = opened

        time_name = _detect_time_name(ds)
        if time_name is None:
            raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').")

        # if time is in variables but not in coords — make it a coordinate
        if time_name in ds.variables and time_name not in ds.coords:
            ds = ds.set_coords(time_name)

        time_var = ds[time_name]
        units = str(time_var.attrs.get("units", ""))
        calendar = str(time_var.attrs.get("calendar", "standard")).lower()

        # Integer "epoch seconds" stored without any units: assume the Unix epoch.
        if units.strip() == "" and pd.api.types.is_integer_dtype(time_var.dtype):
            units = "seconds since 1970-01-01"
            calendar = "proleptic_gregorian"

        decoded = np.asarray(xr.coding.times.decode_cf_datetime(time_var.values, units, calendar))
        # cftime objects expose strftime; numpy datetime64 values do not.
        # Guard on size so an empty time axis does not raise IndexError.
        if decoded.size and hasattr(decoded.flat[0], "strftime"):
            decoded = pd.to_datetime([str(d) for d in decoded])
        else:
            decoded = pd.to_datetime(decoded)

        # rename the time coordinate to the unified 'time'
        if time_name != "time":
            ds = ds.assign_coords({time_name: decoded}).rename({time_name: "time"})
        else:
            ds = ds.assign_coords(time=decoded)

        # NOTE(review): datasets derived via assign_coords/rename appear not to
        # inherit the file closer of the dataset they came from — confirm for the
        # xarray version in use. Re-attaching it makes ds.close() release the file.
        if ds is not opened and hasattr(ds, "set_close"):
            ds.set_close(opened.close)

        return ds

    except Exception as e:
        # Do not leak the file handle when decoding fails half-way.
        if opened is not None:
            opened.close()
        raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") from e


def get_nc_bounds(nc_path: str):
    """
    Return the spatial extent of a NetCDF file.

    Parameters
    ----------
    nc_path : str
        Path to the NetCDF file.

    Returns
    -------
    dict
        ``{"S": ..., "N": ..., "W": ..., "E": ...}`` built from the minimum and
        maximum of the detected latitude/longitude variables.

    Raises
    ------
    ValueError
        If no latitude/longitude variable can be detected.
    RuntimeError
        Propagated from ``safe_open_nc_with_time_decoding``.
    """
    # candidate coordinate names
    lat_candidates = ("lat", "latitude", "y")
    lon_candidates = ("lon", "longitude", "x")

    ds = safe_open_nc_with_time_decoding(nc_path)
    try:
        lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None)
        lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None)
        if lat_name is None or lon_name is None:
            raise ValueError("Could not detect lat/lon coordinate names in NetCDF")

        return {
            "S": float(ds[lat_name].min()),
            "N": float(ds[lat_name].max()),
            "W": float(ds[lon_name].min()),
            "E": float(ds[lon_name].max()),
        }
    finally:
        # Close on the error path too (the original closed only on success).
        ds.close()
def load_vector_extent_info(path):
    """
    Read a vector boundary file and return its bounding box.

    Parameters
    ----------
    path : str or path-like
        Path to a ``.shp`` or ``.geojson`` file.

    Returns
    -------
    tuple
        ``(path, south, north, west, east)`` taken from the layer's total bounds.

    Raises
    ------
    RuntimeError
        If the extension is unsupported or the file cannot be read. The
        original exception is attached as ``__cause__``.
    """
    try:
        ext = Path(path).suffix.lower()
        if ext not in (".shp", ".geojson"):
            raise ValueError("Unsupported file format. Please select a .shp or .geojson file.")

        gdf = gpd.read_file(path)
        # total_bounds is [minx, miny, maxx, maxy]; return plain floats
        west, south, east, north = (float(v) for v in gdf.total_bounds)
        return path, south, north, west, east
    except Exception as e:
        raise RuntimeError(f"Failed to load vector file: {e}") from e


def load_taxa_and_ids_from_csv(file_path):
    """
    Read a Movebank-style CSV and list its taxa and individual IDs.

    Column names are matched case-insensitively; runs of ``-``, ``:``, ``.``,
    ``_`` and whitespace are treated as one separator and surrounding
    whitespace is ignored.

    Returns
    -------
    tuple
        ``(df, unique_taxa, unique_ids, error)``. On success ``error`` is
        ``None``. On failure ``df`` is ``None``, both lists are empty and
        ``error`` holds a message. The taxa list is empty when the file has
        no taxon column.
    """
    try:
        df = pd.read_csv(file_path)
        columns = {re.sub(r"[-:._\s]+", "_", str(col).strip().lower()): col for col in df.columns}
        id_col = columns.get("individual_local_identifier")
        taxon_col = columns.get("individual_taxon_canonical_name")
        if id_col is None:
            return None, [], [], "No column found for individual-local-identifier"

        unique_ids = sorted(df[id_col].dropna().astype(str).unique())
        unique_taxa = sorted(df[taxon_col].dropna().astype(str).unique()) if taxon_col else []

        return df, unique_taxa, unique_ids, None

    except Exception as e:
        # Deliberate: the caller reports the message instead of crashing.
        return None, [], [], str(e)


def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids,
                             boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2,
                             out_csv_path=None):
    """
    Run the full annotation pipeline and write the result to CSV.

    Steps: spatial filtering of the Movebank points, annotation with the
    selected environmental variables, export of one combined CSV and one CSV
    per individual (folder ``annotated_individuals`` next to the combined file).

    Parameters
    ----------
    env_var_map : dict[str, str]
        Variable name -> path of the file that contains it.
    selected_env_vars : list[str]
        Variables to annotate.
    movebank_path : str
        Path to the Movebank CSV.
    selected_ids : list[str]
        Individual IDs to annotate.
    boundary_path : str or None
        Path to a .shp or .geojson boundary.
    interpolation_method : str
        Label of the spatial interpolation method.
    bbox : dict, optional
        Bounding box passed through to the spatial filter.
    smoothing_k : int
        Number of nearest grid points passed to the annotation step.
    out_csv_path : str, optional
        Output file; defaults to ``annotated_env.csv`` next to the input.

    Returns
    -------
    None
    """
    date_format = "%Y-%m-%d %H:%M:%S"

    print("[DEBUG] Annotation started")
    print("Selected variables:", selected_env_vars)
    # .get: a variable missing from the map is reported downstream, it must not
    # abort the run with a KeyError inside a debug print.
    print("From files:", [env_var_map.get(v) for v in selected_env_vars])
    print("Selected IDs:", selected_ids)
    print("Movebank file:", movebank_path)
    print("Boundary file:", boundary_path)
    print("Interpolation method:", interpolation_method)

    # === Step 1: Spatial filtering ===
    df_filtered, _ = filter_points_within_boundary(movebank_path, selected_ids, boundary_path, bbox=bbox)
    if df_filtered.empty:
        print("[WARNING] No points within the boundary.")
        return

    # === Step 2: Loading and interpolation of environmental data ===
    result = load_selected_environmental_data(df_filtered, env_var_map,
                                              selected_env_vars, movebank_path,
                                              interpolation_method, smoothing_k=smoothing_k)
    if result is None:
        print("[ERROR] Environmental data was not loaded.")
        return

    df_annotated = result[0]

    # === Step 3: Time filtering (disabled: the full range is kept) ===
    df_out = df_annotated.copy()
    print("[INFO] Full timestamp range preserved. Outside-NC values will be NaN.")

    # === Step 4: Saving the final result ===
    out_path = Path(out_csv_path) if out_csv_path else Path(movebank_path).parent / "annotated_env.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_out = df_out.drop(columns=["geometry", "nc_lat", "nc_lon"], errors="ignore")
    df_out.to_csv(out_path, index=False, encoding="utf-8-sig", date_format=date_format)
    print(f"[INFO] Final filtered annotation saved to {out_path}")

    # === Step 5: Saving by individual ID ===
    output_folder = out_path.parent / "annotated_individuals"
    output_folder.mkdir(parents=True, exist_ok=True)

    id_col = "individual_local_identifier"
    if id_col not in df_out.columns:
        print("[WARNING] Column 'individual_local_identifier' not found. Skipping per-ID export.")
        return

    n_files = 0
    # groupby skips missing IDs; sort=False avoids comparing mixed-type IDs
    for uid, df_id in df_out.groupby(id_col, sort=False):
        safe_uid = re.sub(r"[^\w\-]", "_", str(uid))
        out_file = output_folder / f"annotated_env_{safe_uid}.csv"
        # same encoding / date format as the combined file
        df_id.to_csv(out_file, index=False, encoding="utf-8-sig", date_format=date_format)
        n_files += 1
    print(f"[INFO] Saved {n_files} individual files to {output_folder}")
def _write_trimmed_csv(gdf, output_path, saved_msg, error_label, empty_msg=None):
    """Best-effort export of the filtered points (without geometry) to CSV."""
    try:
        if empty_msg is not None and gdf.empty:
            print(empty_msg)
            return
        gdf.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False)
        print(f"{saved_msg} {output_path}")
    except Exception as e:
        # Saving the intermediate file is optional; report and carry on.
        print(f"[ERROR] Failed to save {error_label}: {e}")


def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=None, bbox=None):
    """
    Load a Movebank CSV, keep the selected individuals and clip the points.

    Clipping priority: ``bbox`` (dict with keys S, N, W, E) if given, else the
    polygon layer at ``boundary_path``, else no clipping. The retained points
    are also written to ``trimmed.csv`` next to the input file (best effort).

    Parameters
    ----------
    movebank_path : str or path-like
        Movebank CSV. Column names are lower-cased and ``-``, ``:``, ``.`` and
        whitespace runs are replaced by ``_``.
    selected_ids : iterable
        Individual IDs to keep; compared as strings.
    boundary_path : str, optional
        Path to a vector file readable by geopandas.
    bbox : dict, optional
        ``{"S": ..., "N": ..., "W": ..., "E": ...}``.

    Returns
    -------
    tuple
        ``(GeoDataFrame of retained points in EPSG:4326, path of trimmed.csv)``.

    Raises
    ------
    ValueError
        If required columns are missing.
    """
    print("[DEBUG] Filtering is started")
    df = pd.read_csv(movebank_path)
    df.columns = [re.sub(r"[-:.\s]+", "_", col.lower()) for col in df.columns]
    if "location_long" in df.columns and "location_lon" not in df.columns:
        df["location_lon"] = df["location_long"]
    if "timestamp" not in df.columns and "eobs_start_timestamp" in df.columns:
        df["timestamp"] = df["eobs_start_timestamp"]

    required_cols = {"location_lat", "location_lon", "individual_local_identifier", "timestamp"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Required columns are missing in Movebank file. Missing: {required_cols - set(df.columns)}")

    # ID-filter. IDs offered for selection are strings, while pandas may have
    # parsed the column as numbers, so compare on the string form.
    wanted = {str(i) for i in selected_ids}
    df = df[df["individual_local_identifier"].astype(str).isin(wanted)]
    df = interpolate_missing_coordinates(df)

    output_path = Path(movebank_path).parent / "trimmed.csv"

    if bbox is not None:
        S, N, W, E = map(float, (bbox["S"], bbox["N"], bbox["W"], bbox["E"]))
        inside = df["location_lat"].between(S, N) & df["location_lon"].between(W, E)
        df = df.loc[inside].copy()
        gdf_filtered = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df["location_lon"], df["location_lat"]), crs="EPSG:4326"
        )
        _write_trimmed_csv(
            gdf_filtered, output_path,
            saved_msg="[INFO] (bbox) Data saved to",
            error_label="(bbox) data",
            empty_msg="[INFO] No points within bbox. File not saved.",
        )
        return gdf_filtered, output_path

    gdf_points = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df["location_lon"], df["location_lat"]), crs="EPSG:4326"
    )

    if boundary_path is None:
        print("[INFO] No boundary provided. Skipping spatial clipping (all selected IDs kept).")
        _write_trimmed_csv(
            gdf_points, output_path,
            saved_msg="[INFO] (No-boundary) Data saved to",
            error_label="(no-boundary) data",
        )
        return gdf_points, output_path

    # case: boundary from shp/geojson
    gdf_boundary = gpd.read_file(boundary_path)
    if gdf_boundary.crs is None:
        # to_crs() cannot transform a layer without a CRS; assume it is already
        # in the CRS of the points. NOTE(review): confirm this default is wanted.
        print("[WARNING] Boundary file has no CRS; assuming EPSG:4326.")
        gdf_boundary = gdf_boundary.set_crs(gdf_points.crs)
    elif gdf_boundary.crs != gdf_points.crs:
        gdf_boundary = gdf_boundary.to_crs(gdf_points.crs)

    gdf_filtered = gpd.sjoin(
        gdf_points, gdf_boundary[["geometry"]], predicate="within", how="inner"
    ).drop(columns="index_right")
    # A point lying in several overlapping polygons is returned once per
    # polygon; keep one row per original point.
    gdf_filtered = gdf_filtered[~gdf_filtered.index.duplicated(keep="first")]

    _write_trimmed_csv(
        gdf_filtered, output_path,
        saved_msg="[INFO] Filtered data saved to",
        error_label="filtered data",
        empty_msg="[INFO] No points within boundary. File not saved.",
    )
    return gdf_filtered, output_path
# UNUSED OPTION
def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_end: pd.Timestamp) -> pd.DataFrame:
    """
    Keep only the rows whose timestamp lies inside ``[nc_start, nc_end]``.

    Timestamps are parsed with ``dayfirst=True``; unparseable values become
    NaT and are therefore excluded. The input frame is not modified.
    """
    parsed = df.copy()
    parsed["timestamp"] = pd.to_datetime(parsed["timestamp"], dayfirst=True, errors="coerce")
    in_range = parsed["timestamp"].between(nc_start, nc_end)
    filtered_df = parsed[in_range]
    print(f"[INFO] Filtered {len(filtered_df)} / {len(parsed)} rows within NetCDF time range: {nc_start} — {nc_end}")
    return filtered_df


def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill gaps in 'location_lat' / 'location_lon' by time-weighted interpolation.

    Rows with a missing or invalid timestamp are removed. When the frame has an
    'individual_local_identifier' column, each individual is interpolated
    separately, so a gap in one track is never filled from another animal's
    positions. Without that column the whole frame is treated as one track.

    Returns
    -------
    pd.DataFrame
        Copy of the input sorted by timestamp, with 'timestamp' as the first
        column (datetime), numeric coordinates and a fresh integer index.

    Raises
    ------
    ValueError
        If one of the required columns is missing.
    """
    required_cols = {"timestamp", "location_lat", "location_lon"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"DataFrame must contain columns: {required_cols}")

    coords = ["location_lat", "location_lon"]
    id_col = "individual_local_identifier"

    out = df.copy()
    out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce")

    n_missing = int(out["timestamp"].isna().sum())
    if n_missing > 0:
        print(f"[INFO] {n_missing} rows with missing or invalid timestamps were removed before interpolation.")

    # Remove NaT before creating the index; method="time" needs a sorted DatetimeIndex
    out = out.dropna(subset=["timestamp"]).sort_values("timestamp").set_index("timestamp")

    for coord in coords:
        out[coord] = pd.to_numeric(out[coord], errors="coerce")

    filled = out[coords].to_numpy(dtype=float, copy=True)
    if id_col in out.columns:
        # string form groups missing IDs together instead of dropping them
        keys = out[id_col].astype(str).to_numpy()
        groups = [np.flatnonzero(keys == key) for key in pd.unique(keys)]
    elif len(out):
        groups = [np.arange(len(out))]
    else:
        groups = []

    for positions in groups:
        track = out.iloc[positions][coords].interpolate(method="time", limit_direction="both")
        filled[positions] = track.to_numpy()

    out[coords] = filled
    return out.reset_index()


def load_selected_environmental_data(df, env_var_map, selected_vars,
                                     movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2):
    """
    Dispatch to the annotation function that matches the interpolation method.

    Supported labels (case-insensitive, US or UK spelling):
      - anything starting with "nearest"  -> ``annotate_env_nearest``
      - anything containing "idw" or "inverse distance" -> ``annotate_env_IDW``

    Returns
    -------
    Whatever the selected annotation function returns.

    Raises
    ------
    ValueError
        If the label matches neither method.
    """
    label = (interpolation_method or "").strip().lower()
    label = label.replace("neighbor", "neighbour")

    if label.startswith("nearest"):
        return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k)
    if "idw" in label or "inverse distance" in label:
        return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k)
    raise ValueError(f"Unknown interpolation method: {interpolation_method}")
def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2):
    """
    Annotate track points with environmental values from NetCDF grids.

    Temporal: linear interpolation (1D, per point).
    Spatial: nearest neighbour (one grid node per point).

    Parameters
    ----------
    df : pd.DataFrame
        Points with 'timestamp', 'location_lat' and 'location_lon' columns.
    env_var_map : dict[str, str]
        Variable name -> NetCDF file path.
    selected_vars : list[str]
        Variables to annotate; one output column is added per variable.
    movebank_path, smoothing_k
        Accepted for signature parity with ``annotate_env_IDW``; not used here.

    Returns
    -------
    tuple
        ``(annotated frame, pd.NaT, pd.NaT)``. The frame gains 'nc_lat',
        'nc_lon' (coordinates of the grid node used) and a 'geometry' column
        built from them. Points outside a file's time range stay NaN.
    """
    from shapely.geometry import Point

    out = df.copy()
    out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce")
    out = out.dropna(subset=["timestamp", "location_lat", "location_lon"])

    # Everything below works on row *positions*: after filtering, the index
    # labels of `out` are no longer 0..n-1 and must not index NumPy arrays.
    n_rows = len(out)
    stamps = list(out["timestamp"])
    point_lats = out["location_lat"].to_numpy()
    point_lons = out["location_lon"].to_numpy()

    # coordinate placeholders
    nc_latitudes = np.full(n_rows, np.nan)
    nc_longitudes = np.full(n_rows, np.nan)

    time_aliases = ("valid_time", "forecast_time", "verification_time", "t", "Time")
    level_names = ("pressure_level", "isobaricinhpa", "level")

    for var in selected_vars:
        file_path = env_var_map.get(var)
        values = np.full(n_rows, np.nan)
        out[var] = values
        if not file_path or not Path(file_path).is_file():
            print(f"[WARNING] File for {var} not found: {file_path}")
            continue

        opened = None
        try:
            opened = safe_open_nc_with_time_decoding(file_path)
            ds = opened
            if var not in ds:
                print(f"[WARNING] Variable {var} not in {file_path}")
                continue

            # determine basic axes
            dims = list(ds[var].dims)
            lat_dim = "lat" if "lat" in dims else "latitude"
            lon_dim = "lon" if "lon" in dims else "longitude"
            ds = _ensure_sorted(ds, lat_dim, lon_dim)
            da = ds[var]
            dims = list(da.dims)

            time_dim = "time" if "time" in dims else next((d for d in time_aliases if d in dims), None)
            if time_dim is None:
                raise ValueError(f"No time-like dimension in {var}: dims={dims}")
            if time_dim != "time":
                ds = ds.rename({time_dim: "time"})
                da = ds[var]
                dims = list(da.dims)

            # Cut out unnecessary dimensions: pressure_level, number, expver, etc.
            extra = [d for d in dims if d not in ("time", lat_dim, lon_dim)]
            if extra:
                sel = {}
                for d in extra:
                    level_idx = 0
                    if d.lower() in level_names:
                        try:
                            levels = np.asarray(ds[d].values, dtype=float)
                            # pick the level closest to 1000 hPa
                            level_idx = int(np.nanargmin(np.abs(levels - 1000.0)))
                        except Exception:
                            level_idx = 0
                    # every other extra dimension: take the first element
                    sel[d] = level_idx
                # Integer isel already drops these dims. No blanket squeeze():
                # it would also drop a length-1 time/lat/lon axis.
                da = da.isel(sel)

            glat = ds[lat_dim].values
            glon = ds[lon_dim].values
            gtime_ts = pd.to_datetime(ds["time"].values)
            t_min, t_max = gtime_ts.min(), gtime_ts.max()

            # Many consecutive fixes fall into the same grid cell: read each
            # cell's time series from the file only once.
            series_cache = {}
            for pos in range(n_rows):
                t = stamps[pos]
                # out of time range → NaN
                if t < t_min or t > t_max:
                    continue

                ii = _nearest_index(glat, point_lats[pos])
                jj = _nearest_index(glon, point_lons[pos])
                cell = (ii, jj)
                if cell not in series_cache:
                    series_cache[cell] = da.isel({lat_dim: ii, lon_dim: jj}).values  # (time,)

                values[pos] = _interp1d_time(gtime_ts, series_cache[cell], t)
                nc_latitudes[pos] = glat[ii]
                nc_longitudes[pos] = glon[jj]

        except Exception as e:
            # One unreadable variable must not abort the whole annotation.
            print(f"[ERROR] {var}: {e}")
        finally:
            # keep whatever was computed before a failure
            out[var] = values
            if opened is not None:
                opened.close()

    out["nc_lat"] = nc_latitudes
    out["nc_lon"] = nc_longitudes
    out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])]
    return out, pd.NaT, pd.NaT
def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2):
    """
    Annotate track points with environmental values from NetCDF grids.

    Temporal: linear interpolation (1D, per point and per neighbour).
    Spatial: inverse-distance weighting over the k nearest grid nodes, where
    ``k = max(2, smoothing_k)``. Distances are measured in coordinate units.

    Parameters
    ----------
    df : pd.DataFrame
        Points with 'timestamp', 'location_lat' and 'location_lon' columns.
    env_var_map : dict[str, str]
        Variable name -> NetCDF file path.
    selected_vars : list[str]
        Variables to annotate; one output column is added per variable.
    movebank_path
        Accepted for signature parity with ``annotate_env_nearest``; not used.
    smoothing_k : int
        Requested number of neighbouring grid nodes.

    Returns
    -------
    tuple
        ``(annotated frame, pd.NaT, pd.NaT)``. 'nc_lat' / 'nc_lon' repeat the
        point's own coordinates and 'geometry' is built from them. Points
        outside a file's time range stay NaN.
    """
    from shapely.geometry import Point

    k = max(2, int(smoothing_k))
    out = df.copy()
    out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce")
    out = out.dropna(subset=["timestamp", "location_lat", "location_lon"])
    out["nc_lat"] = out["location_lat"].values
    out["nc_lon"] = out["location_lon"].values

    # positional views of the points (index labels are not 0..n-1 here)
    n_rows = len(out)
    stamps = list(out["timestamp"])
    point_lats = out["location_lat"].to_numpy()
    point_lons = out["location_lon"].to_numpy()

    time_aliases = ("valid_time", "forecast_time", "verification_time", "t", "Time")
    level_names = {"pressure_level", "isobaricinhpa", "level"}

    for var in selected_vars:
        file_path = env_var_map.get(var)
        values = np.full(n_rows, np.nan)
        out[var] = values
        if not file_path or not Path(file_path).is_file():
            print(f"[WARNING] File for {var} not found: {file_path}")
            continue

        opened = None
        try:
            opened = safe_open_nc_with_time_decoding(file_path)
            ds = opened
            if var not in ds:
                print(f"[WARNING] Variable {var} not in {file_path}")
                continue

            dims = set(ds[var].dims)
            lat_dim = "lat" if "lat" in dims else "latitude"
            lon_dim = "lon" if "lon" in dims else "longitude"
            ds = _ensure_sorted(ds, lat_dim, lon_dim)
            da = ds[var]

            # find the name of the time dimension and unify to "time"
            dims_list = list(da.dims)
            time_dim = "time" if "time" in dims_list else next(
                (d for d in time_aliases if d in dims_list), None
            )
            if time_dim is None:
                raise ValueError(f"No time-like dimension in {var}: dims={dims_list}")
            if time_dim != "time":
                ds = ds.rename({time_dim: "time"})
                da = ds[var]
                dims_list = list(da.dims)

            # remove extra dimensions (pressure, ensemble, etc.)
            extra_dims = [d for d in dims_list if d not in ("time", lat_dim, lon_dim)]
            if extra_dims:
                sel = {}
                for d in extra_dims:
                    sel[d] = 0
                    # pressure levels: take the one closest to 1000 hPa
                    if d in da.coords and d.lower() in level_names:
                        try:
                            lev = np.asarray(ds[d].values, dtype=float)
                            sel[d] = int(np.nanargmin(np.abs(lev - 1000.0)))
                        except Exception:
                            sel[d] = 0
                # Integer isel already drops these dims. No blanket squeeze():
                # it would also drop a length-1 time/lat/lon axis.
                da = da.isel(sel)

            glat = ds[lat_dim].values
            glon = ds[lon_dim].values
            gtime_ts = pd.to_datetime(ds["time"].values)
            t_min, t_max = gtime_ts.min(), gtime_ts.max()

            # neighbouring points share grid nodes: read each node's series once
            series_cache = {}
            for pos in range(n_rows):
                t = stamps[pos]
                if t < t_min or t > t_max:
                    continue
                xlat = point_lats[pos]
                xlon = point_lons[pos]

                # find k nearest nodes through a local window
                vals = []
                dists = []
                for ii, jj in _k_nearest_indices(glat, glon, xlat, xlon, k):
                    node = (ii, jj)
                    if node not in series_cache:
                        series_cache[node] = da.isel({lat_dim: ii, lon_dim: jj}).values  # (time,)
                    vals.append(_interp1d_time(gtime_ts, series_cache[node], t))
                    dists.append(np.hypot(glat[ii] - xlat, glon[jj] - xlon))

                values[pos] = _idw(vals, dists, p=2)

        except Exception as e:
            # One unreadable variable must not abort the whole annotation.
            print(f"[ERROR] {var}: {e}")
        finally:
            # keep whatever was computed before a failure
            out[var] = values
            if opened is not None:
                opened.close()

    out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])]
    return out, pd.NaT, pd.NaT
def convert_tif_to_nc_before_annotation(tif_paths, output_dir):
    """
    Convert a list of .tif files into a single NetCDF file.

    Files are grouped by variable name (``parse_appeears_variable_name``) and
    each group is stacked into one ``(time, lat, lon)`` DataArray, with the
    time of each layer taken from its file name (``parse_time_from_filename``).

    Value handling per file (band 1 only):
      - the nodata value, if declared, becomes NaN;
      - a scale factor found in the file tags is applied;
      - otherwise, for integer rasters whose values lie within ±10000, the
        0.0001 scale used for int16 NDVI/EVI products is assumed.

    Coordinates are pixel *centres*, computed from the first file of each
    variable.

    Parameters
    ----------
    tif_paths : iterable of str or path-like
    output_dir : str or path-like
        Folder for the result; created if missing.

    Returns
    -------
    str
        Path of the generated .nc file
        (``<first file name up to the first "_">_nc_output.nc``).

    Raises
    ------
    ValueError
        If no files are given, a file name carries no date, or the files of
        one variable do not share the same grid shape.
    """
    tif_paths = [str(Path(p)) for p in tif_paths]
    if not tif_paths:
        raise ValueError("No .tif files provided")

    # 1) Group files by variable
    by_var = {}
    for tif in tif_paths:
        by_var.setdefault(parse_appeears_variable_name(tif), []).append(tif)

    data_vars = {}
    for vname, files in by_var.items():
        times = []
        planes = []
        lat = lon = None

        for tif in sorted(files):
            times.append(parse_time_from_filename(Path(tif).name))

            with rasterio.open(tif) as src:
                stored_as_int = np.issubdtype(np.dtype(src.dtypes[0]), np.integer)
                arr = src.read(1).astype("float32")
                nodata = src.nodata
                if nodata is not None:
                    arr = np.where(arr == nodata, np.nan, arr)

                # Scale factor from the tags, if present (best effort: an
                # unreadable or non-numeric tag falls through to the heuristic).
                scale = None
                try:
                    tags = src.tags()
                    for key in ("scale_factor", "SCALE", "Scale", "scale"):
                        if key in tags:
                            scale = float(tags[key])
                            break
                except Exception:
                    scale = None

                # Heuristic for int16 NDVI/EVI. Restricted to integer rasters so
                # already-scaled float data is not shrunk by another 1e-4.
                if (scale is None and stored_as_int and np.isfinite(arr).any()
                        and np.nanmin(arr) >= -10000 and np.nanmax(arr) <= 10000):
                    scale = 0.0001
                if scale is not None:
                    arr = arr * scale

                if lat is None:
                    transform = src.transform
                    # transform * (col, row) is the pixel's upper-left corner;
                    # the +0.5 offsets give the pixel centre.
                    lon = np.array([(transform * (col + 0.5, 0.5))[0] for col in range(src.width)])
                    lat = np.array([(transform * (0.5, row + 0.5))[1] for row in range(src.height)])
                elif arr.shape != planes[0].shape:
                    raise ValueError(
                        f"Grid shape mismatch for variable {vname}: {tif} has {arr.shape}, "
                        f"expected {planes[0].shape}"
                    )

                planes.append(arr)

        data_vars[vname] = xr.DataArray(
            np.stack(planes),  # (time, lat, lon)
            dims=["time", "lat", "lon"],
            coords={"time": np.array(times), "lat": lat, "lon": lon},
            name=vname,
        )

    ds = xr.Dataset(data_vars)

    base = Path(tif_paths[0]).name.split("_")[0]
    safe_base = re.sub(r"[^\w\-]", "_", base)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out = out_dir / f"{safe_base}_nc_output.nc"
    ds.to_netcdf(out)
    return str(out)
def parse_time_from_filename(filename):
    """
    Extract the acquisition date from an AppEEARS-style file name.

    Example: MOD13A1.061__500m_16_days_NDVI_doy2014145000000_aid0001.tif
    The date is read from the "doyYYYYDDD" token, where DDD is the day of year.

    Parameters
    ----------
    filename : str

    Returns
    -------
    datetime
        Midnight of the encoded day.

    Raises
    ------
    ValueError
        If the name has no "doyYYYYDDD" token or the day of year does not
        exist in that year (000, or 366 in a non-leap year).
    """
    match = re.search(r'doy(\d{4})(\d{3})', filename)
    if not match:
        raise ValueError(f"Cannot parse time from filename: {filename}")

    year, doy = int(match.group(1)), int(match.group(2))
    if doy < 1:
        raise ValueError(f"Cannot parse time from filename: {filename}")

    # Ordinal arithmetic instead of strptime("%Y%j"), which rolls an impossible
    # day of year over into the following year without complaint.
    result = datetime.fromordinal(datetime(year, 1, 1).toordinal() + doy - 1)
    if result.year != year:
        raise ValueError(f"Cannot parse time from filename: {filename}")
    return result
# --- AppEEARS variable-name parser --- #
# Common AppEEARS layer names (incomplete, but useful as a fallback).
_APPEEARS_KNOWN_TOKENS = (
    "NDVI", "EVI",
    "LST_Day_1km", "LST_Night_1km", "LST_Day_1KM", "LST_Night_1KM", "QC_Day", "QC_Night",
    "Lai_500m", "Fpar_500m", "FparLai_QC",
    "Nadir_Reflectance_Band1", "Nadir_Reflectance_Band2", "Nadir_Reflectance_Band3",
    "Nadir_Reflectance_Band4", "Nadir_Reflectance_Band5", "Nadir_Reflectance_Band6",
    "Nadir_Reflectance_Band7",
    "SurfReflect_Band1", "SurfReflect_Band2", "SurfReflect_Band3",
    "SurfReflect_Band4", "SurfReflect_Band5", "SurfReflect_Band6", "SurfReflect_Band7",
    "NDSI_Snow_Cover",
    "VIIRS_NDVI", "VIIRS_EVI",
    "BurnDate", "BurnDate_Uncertainty", "LAI", "FPAR", "QC",
)


def parse_appeears_variable_name(tif_path: str) -> str:
    """
    Return the variable/layer name for an AppEEARS GeoTIFF.

    Order of attempts:
      (A) file tags (long_name, DESCRIPTION, Layer, ...), with every run of
          characters other than word characters and "-" replaced by "_";
      (B1) the part of the file name between the first "_" that is followed by
           an alphanumeric character and "_doyYYYYDDD";
      (B2) the longest known layer token contained in the file name; ties are
           broken by the earliest position in the name, then alphabetically,
           so the result does not depend on hash ordering;
      (B3) the longest remaining name part after removing the extension,
           "aid…" parts, 7-8 digit numbers and "doy…" parts;
      (C) fallback -> "data".
    """
    name = Path(tif_path).name

    # A) read TIF tags. Best effort by design: a missing or unreadable file
    #    simply falls through to file-name parsing.
    try:
        with rasterio.open(tif_path) as src:
            tags = src.tags()
            for key in ("long_name", "DESCRIPTION", "Description", "Layer", "LAYER", "BAND_NAME"):
                raw = str(tags.get(key, "")).strip()
                if raw:
                    return re.sub(r"[^\w\-]+", "_", raw)
    except Exception:
        pass

    # B1) token before "doyYYYYDDD"
    m = re.search(r"_([A-Za-z0-9][A-Za-z0-9_]+)_doy\d{7}", name)
    if m:
        return m.group(1)

    # B2) known tokens, longest first, with a deterministic tie-break
    candidates = [tok for tok in _APPEEARS_KNOWN_TOKENS if tok in name]
    if candidates:
        return min(candidates, key=lambda tok: (-len(tok), name.index(tok), tok))

    # B3) longest informative part of the file name
    parts = [
        part for part in re.split(r"[_.]", name)
        if part
        and part.lower() != "tif"
        and not part.lower().startswith("aid")
        and not re.fullmatch(r"\d{7,8}", part)
        and not part.startswith("doy")
    ]
    if parts:
        # max() returns the first of equally long parts, like the stable sort before it
        return max(parts, key=len)

    return "data"
def _ensure_sorted(ds, lat_dim, lon_dim):
    """
    Return ``ds`` with its latitude and longitude axes in ascending order.

    The nearest-index lookup uses a binary search, which needs ascending
    coordinates. Any axis that is not already non-decreasing is sorted, not
    only a strictly descending one.
    """
    for dim in (lat_dim, lon_dim):
        coord = np.asarray(ds[dim].values)
        if coord.size > 1 and np.any(np.diff(coord) < 0):
            ds = ds.sortby(dim)
    return ds


def _nearest_index(arr, x):
    """
    Index of the element of ascending array ``arr`` closest to ``x``.

    Values outside the array map to the first/last index; on a tie the lower
    index wins.
    """
    # arr is ascending: fast via searchsorted + local check
    idx = int(np.searchsorted(arr, x))
    if idx == 0:
        return 0
    if idx >= len(arr):
        return len(arr) - 1
    return idx if abs(arr[idx] - x) < abs(arr[idx - 1] - x) else idx - 1


def _interp1d_time(grid_times_ts, series_vals, t_target):
    """
    Linear 1D interpolation of a time series at ``t_target``.

    NaN samples are ignored. Returns NaN when fewer than two valid samples
    remain or when the target lies outside the span of the valid samples.
    Times are compared as nanoseconds since the epoch whatever resolution the
    input carries, and an unsorted time axis is sorted first.
    """
    y_all = np.asarray(series_vals, dtype=float)
    # filter NaN in the series
    mask = ~np.isnan(y_all)
    if mask.sum() < 2:
        return np.nan

    # Force nanoseconds: Timestamp.value is in ns, whereas a time axis stored at
    # s/ms/us resolution would otherwise yield integers in its own unit.
    x_all = pd.DatetimeIndex(grid_times_ts).values.astype("datetime64[ns]").astype("int64")
    x = x_all[mask]
    y = y_all[mask]
    if np.any(np.diff(x) < 0):
        # np.interp needs increasing x
        order = np.argsort(x, kind="stable")
        x, y = x[order], y[order]

    xi = np.int64(pd.Timestamp(t_target).value)
    # if out of range — return NaN
    if xi < x[0] or xi > x[-1]:
        return np.nan
    return np.interp(xi, x, y)


def _k_nearest_indices(glat, glon, xlat, xlon, k):
    """
    Return up to ``k`` ``(ilat, ilon)`` index pairs of the grid nodes closest to
    the point, searched in a local window around the nearest node.

    Pairs are ordered by increasing distance (in coordinate units). Fewer than
    ``k`` pairs are returned when the window holds fewer nodes.
    """
    # start from the nearest node
    i0 = _nearest_index(glat, xlat)
    j0 = _nearest_index(glon, xlon)

    # window radius r = ceil(max(1, sqrt(k))) → (2r+1)^2 >= k
    r = int(np.ceil(max(1, np.sqrt(k))))
    i_min, i_max = max(0, i0 - r), min(len(glat) - 1, i0 + r)
    j_min, j_max = max(0, j0 - r), min(len(glon) - 1, j0 + r)

    # collect candidates in the window
    cand = [
        (np.hypot(glat[ii] - xlat, glon[jj] - xlon), ii, jj)
        for ii in range(i_min, i_max + 1)
        for jj in range(j_min, j_max + 1)
    ]
    cand.sort(key=lambda item: item[0])  # stable: ties keep window order
    return [(ii, jj) for _, ii, jj in cand[:k]]
def _idw(values, distances, p=2):
    """
    Inverse-distance-weighted mean of already interpolated values.

    Weights are ``1 / (distance + 1e-12) ** p``; the small offset keeps a
    zero distance finite. Samples whose value is NaN, or whose weight is not
    finite (for example a NaN distance), are ignored. Returns NaN when no
    usable sample remains.
    """
    vals = np.asarray(values, dtype=float)
    d = np.asarray(distances, dtype=float) + 1e-12
    w = 1.0 / (d ** p)
    # ignore NaN values and unusable weights
    usable = ~np.isnan(vals) & np.isfinite(w)
    if not usable.any():
        return np.nan
    w_sel = w[usable]
    return np.sum(w_sel * vals[usable]) / np.sum(w_sel)


# --- NEW: helper ---
def _detect_time_name(ds):
    """
    Return the name of the time-like coordinate/variable of ``ds``, or None.

    1) Well-known names are tried in a fixed order.
    2) Otherwise the first variable whose ``standard_name`` attribute is
       "time", or whose ``units`` attribute contains "since", is used.
    """
    # 1) quick candidates by name
    name_candidates = ("time", "valid_time", "forecast_time", "verification_time",
                       "t", "Time", "datetime", "date")
    for candidate in name_candidates:
        if candidate in ds.coords or candidate in ds.variables:
            return candidate

    # 2) CF attributes: standard_name = "time" or units with the word "since"
    for name, var in ds.variables.items():
        standard_name = str(var.attrs.get("standard_name", "")).lower()
        units = str(var.attrs.get("units", ""))
        if standard_name == "time" or "since" in units:
            return name
    return None
ecodata.app.models import FileSelector +from ecodata.panel_utils import param_widget, register_view, try_catch, rename_param_widgets +from ecodata.app.config import DEFAULT_TEMPLATE +from datetime import datetime +import re +from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv +from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only +from ecodata.annotation_eng_func import start_annotation_process,convert_tif_to_nc_before_annotation, get_nc_bounds, safe_open_nc_with_time_decoding + +logger = logging.getLogger(__file__) + +class movebank_annotation_engine(param.Parameterized): + local_ID_file = param_widget(FileSelector(constrain_path=False, expanded=True, size=10)) + load_data_button = param_widget(pn.widgets.Button(name="Load data", button_type="primary")) + taxon_name_val = param_widget( + pn.widgets.MultiSelect(name="Taxon name (press Ctrl for multiple selection)", options=[], height = 140, disabled=True) + ) + individual_ID = param_widget( + pn.widgets.MultiSelect(name="Individual ID (press Ctrl for multiple selection)", options=[], height = 140, disabled=True) + ) + simple_interp_button = param_widget(pn.widgets.Button(name="Simple interpolation (missing ≤ 1 day)", button_type="primary")) + deployment_time_gap = param_widget( + pn.widgets.IntInput(name="Deployment time gap (minutes)", value=60, step=60, start=0) + ) + min_expected_obs = param_widget( + pn.widgets.IntInput(name="Minimum expected number of observations(per deployment)", value=100, step=50, start=10) + ) + + time_selection_ID = param_widget( + pn.widgets.DatetimeRangeSlider( + name="Select Time Range", + start=datetime(2010, 1, 1), + end=datetime(2025, 12, 31), + value=(datetime(2016, 6, 13), datetime(2016, 6, 14)), + step=2_592_000_000 + ) + ) + time_interval = param_widget(pn.widgets.IntInput(name="Timestep for Interpolation/Averaging (minutes)", value=30, step=1, 
start=1)) + start_from_midnight = param_widget(pn.widgets.Checkbox(name="First timestamp = 00:00:00", value=False)) + out_csv_name = param_widget(pn.widgets.TextInput(name="Output CSV", value=str(Path.home() / "Downloads" / "subset.csv"))) + make_csv = param_widget(pn.widgets.Button(name="Make CSV", button_type="primary")) + merge_files = param_widget(pn.widgets.Checkbox(name="Merge files after processing", value=False)) + delete_individual_ID_files = param_widget(pn.widgets.Checkbox(name="Delete individual files after merge", value=True)) + + folder_to_merge = param_widget(pn.widgets.TextInput(name="Folder with CSV files to merge (select folder)", value=str(Path.home() / "Downloads"))) + delete_empty_columns = param_widget(pn.widgets.Checkbox(name="Delete empty columns after merging", value=False)) + out_merged_csv_name = param_widget(pn.widgets.TextInput(name="Output merged CSV", value=str(Path.home() / "Downloads" / "merged.csv"))) + merge_files_button = param_widget(pn.widgets.Button(name="Merge files in folder", button_type="primary")) + + # === Annotation Engine widgets === + env_data_selector = param_widget( + FileSelector( + name="Environmental data (.nc)", + constrain_path=False, + expanded=True, + size=10 + ) + ) + bound_data_selector = param_widget(FileSelector(name="Boundary data (.shp)", constrain_path=False, expanded=True, size=10)) + movement_data_selector = param_widget(FileSelector(name="Movebank data (.csv)", constrain_path=False, expanded=True, size=10)) + load_env_button = pn.widgets.Button(name="Load environmental data", button_type="primary") + load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") + load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") + reset_bound_button = pn.widgets.Button(name="(!) 
Reset boundary", button_type="primary") + env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl for multiple)", options=[], height = 140 ) + taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl for multiple)", height = 140) + id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl for multiple)", height = 140) + env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", + options=["2", "4", "6", "8"], + value="4" + ) + output_path = pn.widgets.TextInput(name="Output path", value=str(Path.home() / "Downloads" / "annotated_env.csv")) + boundary_info_str = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environment data boundary", + name="", + styles={"white-space": "pre-wrap"}, + sizing_mode="stretch_width" + ) + interpolation_method = pn.widgets.Select( + name="Interpolation method (spatial)", + options=["Nearest neighbour (time-linear)", "Inverse Distance Weighting (time-linear)"], + value="Inverse Distance Weighting (time-linear)" + ) + make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") + + + status_text = param.String("Ready...") + #TIF widgets + # === TIF Annotation Engine widgets === + tif_env_data_selector = param_widget( + FileSelector( + name="Select any .tif file in folder", + constrain_path=False, + expanded=True, + size=10 + ) + ) + tif_movement_data_selector = param_widget(FileSelector(name="Movebank data", constrain_path=False, expanded=True,size=10)) + tif_bound_data_selector = param_widget(FileSelector(name="Boundary data", constrain_path=False, expanded=True, size=10)) + + tif_load_env_button = pn.widgets.Button(name="Load TIF environmental data", button_type="primary") + tif_load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") + tif_load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") + tif_reset_bound_button = pn.widgets.Button(name="(!) Reset boundary", button_type="primary") + tif_control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", + options=["2", "4", "6", "8"], + value="4" + ) + tif_env_data_multiselect = pn.widgets.MultiSelect(name="netCDF Environmental variables", options=[], height = 140) + tif_taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon", height = 140) + tif_id_multiselect = pn.widgets.MultiSelect(name="Select ID", height = 140) + tif_env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + tif_movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width") + tif_output_path = pn.widgets.TextInput(name="Output path", value=str(Path.home() / "Downloads" / "annotated_env_tif.csv")) + tif_boundary_info_str = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environment data boundary", + sizing_mode="stretch_width" + ) + + tif_interpolation_method = pn.widgets.Select( + name="Interpolation method (spatial)", + options=["Nearest neighbour (time-linear)", "Inverse Distance Weighting (time-linear)"], + value="Inverse Distance Weighting (time-linear)" + ) + tif_make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") + + + def __init__(self, **params): + super().__init__(**params) + + self.interpolation_method.name = "Spatial interpolation method (.nc)" + self.tif_interpolation_method.name = "Spatial interpolation method (.tif)" + rename_param_widgets( + self, + [ + "local_ID_file", "load_data_button", + "taxon_name_val", "individual_ID", "simple_interp_button", + "deployment_time_gap", "min_expected_obs", + "time_selection_ID", "time_interval", + "start_from_midnight", "out_csv_name", + "make_csv", "merge_files", + "delete_individual_ID_files","folder_to_merge", + "delete_empty_columns", "out_merged_csv_name", + "merge_files_button", + # === NC Annotation tab === + "env_data_selector", + "bound_data_selector", "movement_data_selector", + "load_env_button", "load_bound_button", "reset_bound_button", + "load_movement_button", "env_data_multiselect", + "taxon_multiselect", "id_multiselect", + "boundary_info_str", "interpolation_method", + "control_smoothing", + "env_info", "movement_info" ,"output_path", + "make_annotation_button", + # === TIF Annotation tab === + "tif_env_data_selector", + "tif_movement_data_selector", + "tif_bound_data_selector","tif_reset_bound_button", + "tif_env_data_multiselect", + "tif_taxon_multiselect", + "tif_id_multiselect", + "tif_interpolation_method", "tif_control_smoothing", + "tif_env_info", "tif_movement_info", + "tif_make_annotation_button" + ] + ) + + self.df = None + self.alert = pn.pane.Markdown(self.status_text) + NC_H = 1080 + # === NC tab === + self._nc_col1 = self._section( + "Environmental data (.nc)", + 
pn.Column(self.env_data_selector, sizing_mode="stretch_width"), + self.load_env_button, + self.env_data_multiselect, + self.env_info, + self.interpolation_method, + self.control_smoothing, + self.output_path, + self.make_annotation_button, + height=NC_H, + ) + self._nc_col2 = self._section( + "Movebank data (.csv)", + pn.Column(self.movement_data_selector, sizing_mode="stretch_width"), + self.load_movement_button, + self.taxon_multiselect, + self.movement_info, + height=NC_H, + ) + self._nc_col3 = self._section( + "Boundary data (.shp/.geojson)", + pn.Column(self.bound_data_selector, sizing_mode="stretch_width"), + pn.Row(self.load_bound_button, self.reset_bound_button), + self.id_multiselect, + self.boundary_info_str, + height=NC_H, + ) + + # synchronize heights after rendering + pn.state.onload(self._sync_nc_column_heights) + + self.anotation_engine_tab = pn.Column( + pn.pane.Markdown("### Annotation engine - .nc", sizing_mode="stretch_width"), + pn.GridBox( + self._nc_col1, self._nc_col2, self._nc_col3, + ncols=3, sizing_mode="stretch_width", + ), + ) + + # TIF + TIF_H = 1080 + self._tif_col1 = self._section( + "Environmental data (.tif) - select one (of)", + pn.Column(self.tif_env_data_selector, sizing_mode="stretch_width"), + self.tif_load_env_button, + self.tif_env_data_multiselect, + self.tif_env_info, + self.tif_interpolation_method, + self.tif_control_smoothing, + self.tif_output_path, + self.tif_make_annotation_button, + height=TIF_H, + ) + + self._tif_col2 = self._section( + "Movebank data (.csv)", + pn.Column(self.tif_movement_data_selector, sizing_mode="stretch_width"), + self.tif_load_movement_button, + self.tif_taxon_multiselect, + self.tif_movement_info, + height=TIF_H, + ) + + self._tif_col3 = self._section( + "Boundary data (.shp/.geojson)", + pn.Column(self.tif_bound_data_selector, sizing_mode="stretch_width"), + pn.Row(self.tif_load_bound_button, self.tif_reset_bound_button), + self.tif_id_multiselect, + self.tif_boundary_info_str, + 
height=TIF_H, + ) + + self.anotation_engine_tif_tab = pn.Column( + pn.pane.Markdown("### Annotation engine - .tif", sizing_mode="stretch_width"), + pn.GridBox( + self._tif_col1, self._tif_col2, self._tif_col3, + ncols=3, + sizing_mode="stretch_width", + ), + ) + + self.crop_interpolate_tab = pn.Column( + pn.pane.Markdown("### Crop files"), + self.local_ID_file, + self.load_data_button, + pn.Row( + self.taxon_name_val, + self.individual_ID, + ), + self.simple_interp_button, + pn.Column(self.deployment_time_gap, self.min_expected_obs), + self.time_selection_ID, + pn.Row(self.time_interval, self.start_from_midnight), + self.out_csv_name, + self.make_csv, + self.merge_files, + self.delete_individual_ID_files, + self.alert + ) + + self.merge_tab = pn.Column( + pn.pane.Markdown("### Merge files (Please select a **folder** with CSV files)"), + self.folder_to_merge, + self.delete_empty_columns, + self.out_merged_csv_name, + self.merge_files_button, + ) + + self.view = pn.Tabs( + ("Annotation engine - .nc", self.anotation_engine_tab), + ("Annotation engine - .tif", self.anotation_engine_tif_tab), + ("Crop & interpolate csv", self.crop_interpolate_tab), + ("Merge csv", self.merge_tab), + ) + + self.simple_interp_button.on_click(self.run_interpolate_missing_only) + self.load_data_button.on_click(self.load_ids_from_file) + self.make_csv.on_click(self.run_make_csv) + self.merge_files_button.on_click(self.run_merge_files) + self.taxon_name_val.param.watch(self.update_individual_ids_by_taxon, 'value') + self.load_env_button.on_click(self.load_env_data) + self.load_bound_button.on_click(self.load_boundary_data) + self.reset_bound_button.on_click(self.reset_boundary_data) + self.load_movement_button.on_click(self.load_movement_data) + self.taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon, 'value') + self.make_annotation_button.on_click(self.run_annotation) + self.env_data_multiselect.param.watch(lambda e: self.update_env_info_text(e.new), "value") + 
self.taxon_multiselect.param.watch(lambda e: self.update_movement_info_text("Taxons", e.new), "value") + self.id_multiselect.param.watch(lambda e: self.update_movement_info_text("IDs", e.new), "value") + self.interpolation_method.param.watch(self._update_smoothing_options, 'value') + ######TIF on click + self.tif_load_env_button.on_click(self.load_env_data_tif) + self.tif_load_bound_button.on_click(self.load_boundary_data_tif) + self.tif_reset_bound_button.on_click(self.reset_boundary_data) + self.tif_load_movement_button.on_click(self.load_movement_data_tif) + self.tif_make_annotation_button.on_click(self.run_annotation_tif) + self.tif_taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon_tif, 'value') + self.tif_env_data_multiselect.param.watch(lambda e: self.update_env_info_text_tif(e.new), "value") + self.tif_taxon_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("Taxons", e.new), "value") + self.tif_id_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("IDs", e.new), "value") + self.tif_interpolation_method.param.watch(self._update_smoothing_options_tif, 'value') + + + + @try_catch("Error loading Individual IDs") + def load_ids_from_file(self, *events): + self.status_text = "Loading IDs..." + self.alert.object = self.status_text + file_path = self.local_ID_file.value + + if not file_path: + self.status_text = "No file selected." 
def update_individual_ids_by_taxon(self, event):
    """Refresh the "Individual ID" picker after the taxon selection changes.

    Does nothing until a CSV has been loaded into ``self.df``.  With an empty
    taxon selection every individual in the file is offered and nothing is
    pre-selected; otherwise only individuals of the chosen taxa are offered
    and all of them are pre-selected.

    Args:
        event: Watcher event; ``event.new`` holds the selected taxon names.
    """
    frame = self.df
    if frame is None:
        return

    taxa = event.new
    if taxa:
        # Narrow the table to rows belonging to the chosen taxa.
        taxon_mask = frame["individual_taxon_canonical_name"].isin(taxa)
        frame = frame[taxon_mask]

    id_strings = frame["individual_local_identifier"].dropna().astype(str)
    ordered_ids = sorted(id_strings.unique())

    self.individual_ID.options = list(ordered_ids)
    self.individual_ID.value = list(ordered_ids) if taxa else []
sorted(filtered["individual_local_identifier"].dropna().astype(str).unique()) + + self.id_multiselect.options = ids + self.id_multiselect.value = ids + + @try_catch("Error generating CSV") + def run_make_csv(self, *events): + try: + individual_ids = self.individual_ID.value + csv_path = Path(self.local_ID_file.value) + interval_minutes = int(self.time_interval.value) + + start_time, end_time = self.time_selection_ID.value + start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f") if not isinstance(start_time, str) else start_time + end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S.%f") if not isinstance(end_time, str) else end_time + + out_csv = self.out_csv_name.value + columns = validate_and_process_csv(csv_path) + + output_files = generate_individual_csvs_for_local_ids( + csv_file=csv_path, + ids=individual_ids, + start_time=start_time_str, + end_time=end_time_str, + interval_minutes=interval_minutes, + output_path_template=out_csv, + columns_to_interpolate=columns, + deployment_time_gap=int(self.deployment_time_gap.value), + min_expected_obs=int(self.min_expected_obs.value), + start_from_midnight=bool(self.start_from_midnight.value) + ) + + if self.merge_files.value: + merged_df = pd.concat([pd.read_csv(f) for f in output_files], ignore_index=True) + merged_output_path = out_csv.replace(".csv", "_merged.csv") + merged_df.to_csv(merged_output_path, index=False) + + if self.delete_individual_ID_files.value: + for f in output_files: + try: + Path(f).unlink() + except Exception as e: + logger.warning(f"Failed to delete {f}: {e}") + + self.status_text = f"Processing complete. 
def _set_time_slider_from_df(self, df: pd.DataFrame):
    """Fit the time-range slider to the timestamps found in ``df``.

    The first column named (after header normalisation) ``timestamp``,
    ``eobs_start_timestamp``, ``time``, ``datetime`` or ``date`` is parsed;
    unparseable entries are ignored.  The slider is left untouched when no
    such column exists or it holds no valid timestamp.

    Args:
        df: Loaded table whose column names are already normalised.
    """
    time_col = None
    for name in ("timestamp", "eobs_start_timestamp", "time", "datetime", "date"):
        if name in df.columns:
            time_col = name
            break
    if time_col is None:
        return

    # Invalid entries become NaT and are discarded before taking min/max.
    parsed = pd.to_datetime(df[time_col], errors="coerce").dropna()
    if parsed.empty:
        return

    earliest = pd.Timestamp(parsed.min()).to_pydatetime()
    latest = pd.Timestamp(parsed.max()).to_pydatetime()

    # Move the slider bounds first, then select the full span.
    slider = self.time_selection_ID
    slider.start = earliest
    slider.end = latest
    slider.value = (earliest, latest)
+ self.alert.object = self.status_text + return + + # If the selector suddenly returns a list, we require exactly 1 + if isinstance(raw, (list, tuple, set)): + if len(raw) != 1: + self.status_text = "Select exactly one .nc file." + self.alert.object = self.status_text + return + nc_path = str(list(raw)[0]).strip() + else: + nc_path = str(raw).strip() + + if Path(nc_path).suffix.lower() != ".nc": + self.status_text = "Only .nc is supported on this tab." + self.alert.object = self.status_text + return + + # Update "File:" immediately + self._update_info_lines(self.env_info, {"File:": Path(nc_path).name}) + self._auto_height(self.env_info) + + var_file_map: dict[str, str] = {} + time_text = "-" + spatial_text = "-" + + # Auxiliary coordinate name candidates + time_candidates = ("time","Time","datetime","date","valid_time","forecast_time","verification_time") + lat_candidates = ("lat", "latitude", "y") + lon_candidates = ("lon", "longitude", "x") + + try: + ds = safe_open_nc_with_time_decoding(nc_path) + try: + # ---- TIME ---- + time_name = next((c for c in time_candidates if c in ds.coords or c in ds.variables), None) + if time_name is not None: + tmin = pd.to_datetime(ds[time_name].values.min()) + tmax = pd.to_datetime(ds[time_name].values.max()) + time_text = f"{tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}" + + # ---- SPATIAL ---- + lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) + lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) + if lat_name and lon_name: + lat_min = float(ds[lat_name].min()) + lat_max = float(ds[lat_name].max()) + lon_min = float(ds[lon_name].min()) + lon_max = float(ds[lon_name].max()) + spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" + + # List of 3D variables (have at least 3 dimensions) - + for var in ds.data_vars: + da = ds[var] + if da.ndim >= 3: + var_file_map[var] = nc_path + finally: + ds.close() + except 
Exception as e: + self.status_text = f"Failed to open dataset: {e}" + self.alert.object = self.status_text + return + + # Update Time/Spatial information block + self._update_info_lines(self.env_info, { + "Time range:": time_text, + "Spatial range:": spatial_text + }) + self._auto_height(self.env_info) + + # Variable options + if not var_file_map: + self.env_data_multiselect.options = [] + self.status_text = "No 3D variables (e.g. time/lat/lon) found in the file." + self.alert.object = self.status_text + return + + self.env_variable_sources = var_file_map + self.env_data_multiselect.options = list(var_file_map.keys()) + self.status_text = f"Loaded {len(var_file_map)} variable(s) from 1 file." + self.alert.object = self.status_text + self._sync_nc_column_heights() + + @try_catch("Error loading boundary data") + def load_boundary_data(self, *events): + self.status_text = "Loading boundary data..." + self.alert.object = self.status_text + + file_input = self.bound_data_selector.value + + if not file_input: + self.status_text = "Please select one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + + # If multiple files are selected + if isinstance(file_input, list): + if len(file_input) != 1: + self.status_text = "Please select exactly one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + file_path = file_input[0] + else: + file_path = file_input + + try: + path, S, N, W, E = load_vector_extent_info(file_path) + self.boundary_path = path + self.boundary_info_str.object = ( + f"Boundary file: {Path(path).name}
@try_catch("Error loading movement data")
def load_movement_data(self, *events):
    """Load a Movebank CSV for the .nc annotation tab and refresh its widgets.

    The path is read from ``movement_data_selector`` and loaded through
    ``load_taxa_and_ids_from_csv``.  On success the method:

    * normalises the column names (lower case, runs of ``-._``/whitespace
      collapsed to ``_``) and aliases ``location_long`` as ``location_lon``
      when only the former exists;
    * stores the table on ``self.df`` and fills/enables the ID and taxon
      multiselects;
    * rewrites the "Time range:" and "Spatial range:" rows of
      ``movement_info``.

    Progress and errors are written to ``self.status_text`` and mirrored in
    ``self.alert``.

    Args:
        *events: Button-click events forwarded by Panel; ignored.
    """
    self.status_text = "Loading movement data..."
    self.alert.object = self.status_text

    file_path = self.movement_data_selector.value
    if not file_path:
        self.status_text = "No movement file selected."
        self.alert.object = self.status_text
        return

    df, taxa, ids, err = load_taxa_and_ids_from_csv(file_path)
    if err:
        self.status_text = f"Error: {err}"
        self.alert.object = self.status_text
        return

    # Normalise headings so the column lookups below are spelling-agnostic.
    df.columns = [re.sub(r"[-._\s]+", "_", col.lower()) for col in df.columns]
    if "location_long" in df.columns and "location_lon" not in df.columns:
        df["location_lon"] = df["location_long"]

    self.df = df
    self.id_multiselect.options = ids
    self.id_multiselect.disabled = False
    self.taxon_multiselect.options = taxa
    self.taxon_multiselect.disabled = False
    self.status_text = f"Loaded {len(ids)} IDs and {len(taxa)} taxon names."

    cols = set(df.columns)

    # ---- TIME ----
    time_value = "-"
    time_col = next((c for c in ("timestamp", "time", "datetime", "date") if c in cols), None)
    if time_col is not None:
        ts = pd.to_datetime(df[time_col], errors="coerce")
        if ts.notna().any():
            time_value = f"{ts.min():%Y-%m-%d %H:%M:%S} — {ts.max():%Y-%m-%d %H:%M:%S}"

    # ---- SPATIAL ----
    spatial_value = "-"
    lat_col = next((c for c in ("location_lat", "latitude", "lat", "y") if c in cols), None)
    lon_col = next((c for c in ("location_lon", "longitude", "lon", "x") if c in cols), None)
    if lat_col and lon_col:
        lat = pd.to_numeric(df[lat_col], errors="coerce")
        lon = pd.to_numeric(df[lon_col], errors="coerce")
        if lat.notna().any() and lon.notna().any():
            spatial_value = (
                f"lat[{float(lat.min()):.3f}..{float(lat.max()):.3f}], "
                f"lon[{float(lon.min()):.3f}..{float(lon.max()):.3f}]"
            )

    # Go through the same helper the environmental-data loaders use.  The
    # previous hand-rolled row replacement wrote a bare "-" when no range
    # could be computed, which dropped the row label and made the row
    # impossible to find (by its "Time range:" / "Spatial range:" prefix)
    # on the next load.
    # NOTE(review): assumes _update_info_lines replaces the row whose label
    # matches the key and keeps the label, as its other call sites imply.
    self._update_info_lines(self.movement_info, {
        "Time range:": time_value,
        "Spatial range:": spatial_value,
    })

    self.alert.object = self.status_text
    self._sync_nc_column_heights()
@try_catch("Error loading TIF environmental data")
def load_env_data_tif(self, *events):
    """
    Convert a folder of AppEEARS GeoTIFFs to one NetCDF and populate the TIF tab.

    Workflow:
      1) Validate that the selected path is an existing ``.tif`` file and
         collect every ``*.tif`` in the same folder.
      2) Require a Movebank CSV to be selected on the TIF tab; the NetCDF is
         written to that CSV's folder.
      3) Convert the TIF set via ``convert_tif_to_nc_before_annotation``.
      4) Open the result with ``safe_open_nc_with_time_decoding`` and read the
         time range, the spatial extent, and the variables that have a
         ``time`` dimension and at least three dimensions.
      5) Update the info panel, the variable multiselect and the status text.

    Side effects:
      - ``self.tif_nc_path`` is set to the generated NetCDF path.
      - ``self.tif_env_var_map`` is set to ``{variable: nc_path}`` for the
        retained variables (empty dict when none qualify).

    Args:
        *events: Button-click events forwarded by Panel; ignored.
    """
    def _report(message):
        # Keep the status parameter and the visible alert pane in step.
        self.status_text = message
        self.alert.object = self.status_text

    # --- 0) Initial UI/status ------------------------------------------------
    _report("Loading TIF environmental data...")

    # --- 1) Validate the sample TIF and collect its folder --------------------
    tif_sample_path = Path(getattr(self.tif_env_data_selector, "value", "") or "")
    if (not tif_sample_path.is_file()) or (tif_sample_path.suffix.lower() != ".tif"):
        _report(f"Selected path is not a .tif file: {tif_sample_path}")
        return

    folder_path = tif_sample_path.parent
    tif_files = sorted(str(p) for p in folder_path.glob("*.tif") if p.is_file())
    if not tif_files:
        _report(f"No .tif files found in: {folder_path}")
        return

    # --- 2) Movebank CSV decides where the NetCDF is written ------------------
    movebank_path = getattr(self.tif_movement_data_selector, "value", None)
    if not movebank_path or not Path(str(movebank_path)).is_file():
        _report("Please load Movebank data before environmental data.")
        return
    output_dir = str(Path(str(movebank_path)).parent)

    # --- 3) Convert TIF stack -> NetCDF ---------------------------------------
    try:
        nc_path = convert_tif_to_nc_before_annotation(tif_files, output_dir)
    except Exception as e:
        _report(f"Failed to convert TIF to NetCDF: {e}")
        return

    # Cached for later use (bbox fallback, re-open, etc.).
    self.tif_nc_path = nc_path

    # --- 4) Inspect the NetCDF -------------------------------------------------
    var_file_map: dict[str, str] = {}
    time_value = "-"
    spatial_value = "-"

    # Open outside the inspection block so the dataset is always bound when
    # the ``finally`` clause closes it.
    try:
        ds = safe_open_nc_with_time_decoding(nc_path)
    except Exception as e:
        _report(f"Failed to open/inspect NetCDF: {e}")
        return

    try:
        # Time range (best effort: the panel keeps "-" on failure).
        if ("time" in ds.coords) or ("time" in ds.variables):
            try:
                tmin = pd.to_datetime(ds["time"].values.min())
                tmax = pd.to_datetime(ds["time"].values.max())
                time_value = f"{tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}"
            except Exception as e:
                logger.warning(f"Could not read time range from {nc_path}: {e}")

        # Spatial extent (coordinate names vary between products).
        lat_name = next((c for c in ("lat", "latitude", "y") if c in ds.coords or c in ds.variables), None)
        lon_name = next((c for c in ("lon", "longitude", "x", "long") if c in ds.coords or c in ds.variables), None)
        if lat_name and lon_name:
            try:
                lat_min = float(ds[lat_name].min())
                lat_max = float(ds[lat_name].max())
                lon_min = float(ds[lon_name].min())
                lon_max = float(ds[lon_name].max())
                spatial_value = (
                    f"lat[{lat_min:.3f}..{lat_max:.3f}], "
                    f"lon[{lon_min:.3f}..{lon_max:.3f}]"
                )
            except Exception as e:
                logger.warning(f"Could not read spatial extent from {nc_path}: {e}")

        # Keep ONLY variables that have a 'time' dim and are 3D or higher.
        for name in ds.data_vars:
            da = ds[name]
            if ("time" in da.dims) and (da.ndim >= 3):
                var_file_map[name] = nc_path
    except Exception as e:
        _report(f"Failed to open/inspect NetCDF: {e}")
        return
    finally:
        ds.close()

    # Insertion order of the dict is the order the variables were found in.
    var_names = list(var_file_map)

    # --- 5) Update UI: info panel, multiselect, status -------------------------
    self._update_info_lines(self.tif_env_info, {
        "File:": Path(nc_path).name,
        "Time range:": time_value,
        "Spatial range:": spatial_value,
    })

    if not var_file_map:
        # No valid 3D variables (time/lat/lon) found.
        self.tif_env_var_map = {}
        self.tif_env_data_multiselect.options = []
        self.tif_env_data_multiselect.value = []
        _report("No 3D (time/lat/lon) variables found in the generated NetCDF.")
        return

    # Stored so run_annotation_tif() can reuse the filtered map directly.
    self.tif_env_var_map = var_file_map

    self.tif_env_data_multiselect.options = var_names
    if not self.tif_env_data_multiselect.value:
        # Default to the first variable when nothing is selected yet.
        self.tif_env_data_multiselect.value = var_names[:1]

    _report(
        f"Converted {len(tif_files)} TIF files to NetCDF. "
        f"Variables (3D/time): {', '.join(var_names)}"
    )
+ - It also assumes UI widgets exist on the instance: + * self.tif_movement_data_selector (file path to Movebank CSV) + * self.tif_env_data_selector (a sample TIF inside the desired folder) + * self.tif_env_data_multiselect (variable picker) + * self.id_multiselect or self.tif_id_multiselect (optional animal IDs) + * self.tif_bound_data_selector or self.bound_data_selector (optional boundary file) + * self.tif_interpolation_method or self.interpolation_method (method name) + * self.tif_output_path or self.output_path (optional output CSV path) + - Status messages are written to `self.status_text` and mirrored in `self.alert.object`. + """ + self.status_text = "Starting annotation (TIF)…" + self.alert.object = self.status_text + + # --- 0) Validate inputs --------------------------------------------------- + # Movebank CSV (required) + movebank_path = getattr(self.tif_movement_data_selector, "value", None) + if not movebank_path or not Path(str(movebank_path)).is_file(): + self.status_text = "Please load Movebank data before environmental data." + self.alert.object = self.status_text + return + + output_dir = str(Path(str(movebank_path)).parent) + + # Sample TIF file (to infer the target folder) + tif_sample = getattr(self, "tif_env_data_selector", None) + tif_sample = getattr(tif_sample, "value", None) + if not tif_sample or Path(tif_sample).suffix.lower() != ".tif": + self.status_text = "Please select a sample .tif file in the folder you want to annotate." + self.alert.object = self.status_text + return + + # Selected animal IDs (optional) + id_widget = getattr(self, "tif_id_multiselect", None)# or getattr(self, "id_multiselect", None) + selected_ids = list(getattr(id_widget, "value", [])) if id_widget else [] + if not selected_ids: + # Not critical—downstream may annotate all IDs or handle empty list. 
+ print("[WARN] No IDs selected; proceeding without explicit ID filtering.") + + # Optional boundary + bound_widget = getattr(self, "tif_bound_data_selector", None)# or getattr(self, "bound_data_selector", None) + boundary_path = getattr(bound_widget, "value", None) + if boundary_path and not Path(boundary_path).is_file(): + print(f"[WARN] Boundary file not found: {boundary_path}. Proceeding without boundary.") + boundary_path = None + + # Interpolation and time-fit options (prefer TIF-tab widgets; fallback to NC-tab) + interp_widget = getattr(self, "tif_interpolation_method", None) + interp_method = getattr(interp_widget, "value", "Nearest neighbour (time-linear)") + + # Output CSV path (optional) + out_widget = getattr(self, "tif_output_path", None) + output_csv_path = getattr(out_widget, "value", None) + + # --- 1) Collect TIFs from the selected folder ----------------------------- + folder_path = Path(tif_sample).parent + tif_paths = sorted(p for p in folder_path.glob("*.tif") if p.is_file()) + if not tif_paths: + self.status_text = f"No .tif files found in: {folder_path}" + self.alert.object = self.status_text + return + + # --- 2) Convert TIF → NetCDF (multi-variable) ----------------------------- + output_dir = str(Path(movebank_path).parent) + nc_path = convert_tif_to_nc_before_annotation([str(p) for p in tif_paths], output_dir) + self.tif_nc_path = nc_path # cache for later use + + # --- 3) Read variables from NetCDF and build env_var_map ------------------ + # Prefer already-filtered map from load_env_data_tif (only 3D with 'time') + if getattr(self, "tif_env_var_map", None): + env_var_map = dict(self.tif_env_var_map) + var_names = list(env_var_map.keys()) + else: + # Fallback: inspect the .nc and keep only 3D with a time dim + env_var_map, var_names = {}, [] + try: + ds = safe_open_nc_with_time_decoding(nc_path) + try: + for v in ds.data_vars: + da = ds[v] + if ("time" in da.dims) and (da.ndim >= 3): + env_var_map[v] = nc_path + var_names.append(v) + 
finally: + ds.close() + except Exception as e: + self.status_text = f"Failed to read variables from NetCDF: {e}" + self.alert.object = self.status_text + return + + if not var_names: + self.status_text = "No 3D (time/lat/lon) variables found in the generated NetCDF." + self.alert.object = self.status_text + return + + + # --- 4) Which variables to annotate? -------------------------------------- + ms_widget = getattr(self, "tif_env_data_multiselect", None) + selected_vars = list(getattr(ms_widget, "value", [])) if ms_widget else [] + if not selected_vars: + selected_vars = var_names[:1] # default to the first variable + if ms_widget: + ms_widget.value = selected_vars # sync UI state + + # --- 5) Kick off annotation ------------------------------------------------ + self.status_text = ( + f"Annotating variables: {', '.join(selected_vars)} | " + f"IDs: {len(selected_ids) if selected_ids else 'all/unspecified'} | " + f"Interpolation: {interp_method}" + ) + self.alert.object = self.status_text + + try: + start_loading_spinner() + except Exception: + pass + + try: + # Auto-bbox from .nc if no boundary file selected + bbox = None + if not boundary_path: + try: + bounds = get_nc_bounds(self.tif_nc_path) # {"S","N","W","E"} + bbox = bounds + self.tif_boundary_info_str.object = ( + "Boundary file: not selected (auto from .nc)
" + f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " + f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + ) + except Exception: + pass + start_annotation_process( + env_var_map=env_var_map, + selected_env_vars=selected_vars, + movebank_path=str(movebank_path), + selected_ids=selected_ids, + boundary_path=str(boundary_path) if boundary_path else None, + interpolation_method=interp_method, + bbox=bbox, + smoothing_k=int(self.tif_control_smoothing.value), + out_csv_path=output_csv_path + ) + self.status_text = "Annotation finished successfully (TIF)." + self.alert.object = self.status_text + except Exception as e: + self.status_text = f"Annotation failed (TIF): {e}" + self.alert.object = self.status_text + print("[ERROR] Annotation failed (TIF):", e) + finally: + try: + stop_loading_spinner() + except Exception: + pass + + + @try_catch("Error loading TIF boundary data") + def load_boundary_data_tif(self, *events): + self.status_text = "Loading TIF boundary data..." + self.alert.object = self.status_text + + file_input = self.tif_bound_data_selector.value + if not file_input: + self.status_text = "Please select one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + + if isinstance(file_input, list): + if len(file_input) != 1: + self.status_text = "Please select exactly one vector file (.shp or .geojson)." + self.alert.object = self.status_text + return + file_path = file_input[0] + else: + file_path = file_input + + try: + path, S, N, W, E = load_vector_extent_info(file_path) + self.boundary_path = path + self.tif_boundary_info_str.object = ( + f"Boundary file: {Path(path).name}
" + f"Spatial range: lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + self.status_text = ( + f"TIF Boundary loaded: " + f"lat[{S:.3f}..{N:.3f}], lon[{W:.3f}..{E:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to read vector file: {e}" + self.alert.object = self.status_text + + @try_catch("Error loading TIF movement data") + def load_movement_data_tif(self, *events): + self.status_text = "Loading TIF movement data..." + self.alert.object = self.status_text + + file_path = self.tif_movement_data_selector.value + if not file_path: + self.status_text = "No TIF movement file selected." + self.alert.object = self.status_text + return + + df, taxa, ids, err = load_taxa_and_ids_from_csv(file_path) + if err: + self.status_text = f"Error: {err}" + else: + df.columns = [re.sub(r"[-._\s]+", "_", col.lower()) for col in df.columns] + if "location_long" in df.columns and "location_lon" not in df.columns: + df["location_lon"] = df["location_long"] + self.df = df # shared for both tabs + self.tif_id_multiselect.options = ids + self.tif_id_multiselect.disabled = False + self.tif_taxon_multiselect.options = taxa + self.tif_taxon_multiselect.disabled = False + self.status_text = f"TIF: Loaded {len(ids)} IDs and {len(taxa)} taxon names." + mv_current = self.tif_movement_info.object or "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -" + lines = mv_current.split("
") + if lines: + lines[0] = f"File: {Path(file_path).name}" + + # + try: + ts = pd.to_datetime(df["timestamp"], errors="coerce") + lat = pd.to_numeric(df["location_lat"], errors="coerce") + lon = pd.to_numeric(df["location_lon"], errors="coerce") + if ts.notna().any(): + tmin = ts.min().strftime("%Y-%m-%d %H:%M:%S") + tmax = ts.max().strftime("%Y-%m-%d %H:%M:%S") + for i, line in enumerate(lines): + if line.startswith("Time range:"): + lines[i] = f"Time range: {tmin} — {tmax}" + if lat.notna().any() and lon.notna().any(): + lat_min, lat_max = float(lat.min()), float(lat.max()) + lon_min, lon_max = float(lon.min()), float(lon.max()) + for i, line in enumerate(lines): + if line.startswith("Spatial range:"): + lines[i] = f"Spatial range: lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" + + except Exception: + pass + + self.tif_movement_info.object = "
".join(lines) + self.alert.object = self.status_text + + @try_catch("Interpolation (missing only) failed") + def run_interpolate_missing_only(self, *events): + # 1) input + csv_path = Path(self.local_ID_file.value) + if not csv_path.exists(): + self.status_text = "No file selected." + self.alert.object = self.status_text + return + + # 2) Determine the ID: if the user did not choose, we take all + if self.df is None: + try: + df_tmp = pd.read_csv(csv_path) + df_tmp.columns = [re.sub(r"[-._:\s]+", "_", c.lower()) for c in df_tmp.columns] + except Exception as e: + self.status_text = f"Failed to read CSV: {e}" + self.alert.object = self.status_text + return + all_ids = sorted(df_tmp.get("individual_local_identifier", pd.Series([], dtype=str)).dropna().astype(str).unique()) + else: + all_ids = sorted(self.df.get("individual_local_identifier", pd.Series([], dtype=str)).dropna().astype(str).unique()) + + selected_ids = list(self.individual_ID.value) if self.individual_ID.value else all_ids + if not selected_ids: + self.status_text = "No IDs to process." + self.alert.object = self.status_text + return + + # 3) Time range + start_time, end_time = self.time_selection_ID.value + start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f") + end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S.%f") + + # 4) Which columns to interpolate: taken from your validating function + columns = validate_and_process_csv(csv_path) + + # 5) Call simplified interpolation + # if you replaced check_missing_values_only -> it now interpolates, + # otherwise import interpolate_missing_values_only and call it here. + out_template = self.out_csv_name.value + created = interpolate_missing_values_only( + start_time_str, end_time_str, csv_path, selected_ids, columns, out_template + ) + # or: + # created = interpolate_missing_values_only(...) + + # 6) result + if created: + self.status_text = f"Interpolation complete. Files: {len(created)}. 
Example: {created[0]}" + else: + self.status_text = "Interpolation complete. No files created (no eligible gaps ≤ 1 day)." + self.alert.object = self.status_text + + def update_annotation_ids_by_taxon_tif(self, event): + if self.df is None: + return + + selected_taxa = event.new + if not selected_taxa: + ids = sorted(self.df["individual_local_identifier"].dropna().astype(str).unique()) + else: + filtered = self.df[self.df["individual_taxon_canonical_name"].isin(selected_taxa)] + ids = sorted(filtered["individual_local_identifier"].dropna().astype(str).unique()) + + self.tif_id_multiselect.options = ids + self.tif_id_multiselect.value = ids + + def update_env_info_text(self, selected_vars): + current = self.env_info.object or "" + lines = current.split("
") + updated_lines = [] + found = False + for line in lines: + if "Environment parameters" in line: + updated_lines.append(f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + found = True + else: + updated_lines.append(line) + if not found: + updated_lines.insert(1, f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + self.env_info.object = "
".join(updated_lines) + + + def update_movement_info_text(self, section, new_values): + current = self.movement_info.object or "" + lines = current.split("
") + updated_lines = [] + for line in lines: + if section == "Taxons" and "Taxons" in line: + updated_lines.append(f"Taxons: {', '.join(new_values) if new_values else '-'}") + elif section == "IDs" and "IDs" in line: + updated_lines.append(f"IDs: {', '.join(new_values) if new_values else '-'}") + else: + updated_lines.append(line) + self.movement_info.object = "
".join(updated_lines) + + def update_env_info_text_tif(self, selected_vars): + current = self.tif_env_info.object or "" + if not current: + current = "File: not selected
Environment parameters: -
Time range: -
Spatial range: -
" + lines = current.split("
") + updated = [] + found = False + for line in lines: + if "Environment parameters" in line: + updated.append(f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + found = True + else: + updated.append(line) + if not found: + updated.insert(1, f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") + self.tif_env_info.object = "
".join(updated) + + def update_movement_info_text_tif(self, section, new_values): + current = self.tif_movement_info.object or "" + if not current: + current = "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
" + lines = current.split("
") + updated = [] + for line in lines: + if section == "Taxons" and "Taxons" in line: + updated.append(f"Taxons: {', '.join(new_values) if new_values else '-'}") + elif section == "IDs" and "IDs" in line: + updated.append(f"IDs: {', '.join(new_values) if new_values else '-'}") + else: + updated.append(line) + self.tif_movement_info.object = "
".join(updated) + + + def _update_info_lines(self, pane, changes: dict): + """ + Safely updates rows in pane.object by tags: + changes = {"File:": "...", "Time range:": "...", "Spatial range:": "...", "Environment parameters:": "..."} + If the row with the tag does not exist, it is added. + """ + default = "File: not selected
Environment parameters: -
Time range: -
Spatial range: -
" + current = pane.object or default + lines = current.split("
") + idx = {} + for i, line in enumerate(lines): + for key in changes.keys(): + if line.strip().startswith(key): + idx[key] = i + + for key, val in changes.items(): + if key in idx: + lines[idx[key]] = f"{key} {val}" + else: + # insert at the end before the empty last one, if there is one + insert_pos = len(lines) - 1 if lines and lines[-1] == "" else len(lines) + lines.insert(insert_pos, f"{key} {val}") + + pane.object = "
".join(lines) + + + def _section(self, title, *items, height=None): + body = pn.Column(*items, sizing_mode="stretch_width") + + return pn.Card( + body, + title=title, + collapsible=False, + margin=(0, 0, 10, 0), + sizing_mode="stretch_width", + height=height, + ) + + def _auto_height(self, pane, line_px=22, padding=8): + lines = [l for l in (pane.object or "").split("
") if l.strip()] + pane.height = line_px * max(1, len(lines)) + padding + + def _update_smoothing_options(self, event): + """Updates options for control_smoothing depending on interpolation method (.nc).""" + if event.new.startswith("Nearest neighbour"): + self.control_smoothing.options = ["1"] + self.control_smoothing.value = "1" + else: + self.control_smoothing.options = ["2", "4", "6", "8"] + if self.control_smoothing.value == "1": + self.control_smoothing.value = "4" + + def _update_smoothing_options_tif(self, event): + """Updates options for control_smoothing depending on interpolation method(.tif).""" + if event.new.startswith("Nearest neighbour"): + self.tif_control_smoothing.options = ["1"] + self.tif_control_smoothing.value = "1" + else: + self.tif_control_smoothing.options = ["2", "4", "6", "8"] + if self.tif_control_smoothing.value == "1": + self.tif_control_smoothing.value = "4" + + def _sync_nc_column_heights(self): + """Adjusts the height of the 2nd and 3rd columns to the 1st.""" + first = getattr(self, "_nc_col1", None) + second = getattr(self, "_nc_col2", None) + third = getattr(self, "_nc_col3", None) + if not first or not second or not third: + return + + if first.height is None: + pn.state.onload(lambda: self._apply_nc_height_from_first()) + else: + self._apply_nc_height_from_first() + + def _apply_nc_height_from_first(self): + first = self._nc_col1 + if not first: + return + h = first.height + if h is None: + return + self._nc_col2.height = h + self._nc_col3.height = h + + def reset_boundary_data(self, *events): + """ + Resets boundary to default: no file selected, range = environment boundary (.nc). + Also clears self.boundary_path so annotation goes back to 'auto from .nc' mode. + """ + self.boundary_path = None + default_nc = "Boundary file: not selected
Spatial range: = environment data boundary" + default_tif = "Boundary file: not selected
Spatial range: = environment data boundary" + try: + self.boundary_info_str.object = default_nc + except Exception: + pass + try: + self.tif_boundary_info_str.object = default_tif + except Exception: + pass + + self.status_text = "Boundary reset to default (auto from .nc)." + self.alert.object = self.status_text + + self._sync_nc_column_heights() + +@register_view() +def view(): + viewer = movebank_annotation_engine() + template = DEFAULT_TEMPLATE(main=[viewer.alert, viewer.view]) + return template + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + +if __name__.startswith("bokeh"): + view() \ No newline at end of file diff --git a/ecodata/movebank_functions.py b/ecodata/movebank_functions.py new file mode 100644 index 0000000..1d4a4fb --- /dev/null +++ b/ecodata/movebank_functions.py @@ -0,0 +1,976 @@ +""" +movebank_functions.py + +Processes Movebank CSV datasets using "timestamp" or "eobs:start-timestamp" column, +filter data by "individual-taxon-canonical-name" and "individual-local-identifier". + +Interpolation is always performed first using a 1-minute interval. This produces a regularly spaced time series. + +After interpolation, optional averaging is performed over a user-defined interval (e.g. 30 minutes). +Only numeric columns (such as 'eobs:temperature', 'ground-speed', 'height-above-ellipsoid') are averaged. + +All non-numeric columns (e.g. metadata or identifiers) are forward-filled from the last known value +during interpolation and retained without modification during averaging. 
+""" + +import csv +from datetime import datetime, timedelta +import pandas as pd +from pathlib import Path +import numpy as np +import re + +TIME_COLUMN = 'timestamp' # Set to "eobs:start-timestamp" or "timestamp" as needed + +# --- Utilities --- +from datetime import datetime +import pandas as pd +import re + +def parse_timestamp(s: str) -> datetime: + """ + Robust timestamp parser: + - Keeps backward compatibility with ISO-like strings: 'YYYY-MM-DD HH:MM:SS[.ffffff]' + - Supports day-first formats: 'DD.MM.YYYY HH:MM', 'DD.MM.YYYY HH:MM:SS[.ffffff]' + - Accepts 'T' separator and 'Z' / timezone offsets (drops tzinfo → naive) + - Pads/truncates fractional seconds to 6 digits when present + """ + if s is None: + raise ValueError("Timestamp is None") + + s = str(s).strip() + if not s: + raise ValueError("Empty timestamp") + + # --- Fast path: ISO with optional 'Z' or offset --- + # Example: 2020-01-02T03:04:05.123Z, 2020-01-02 03:04:05.123456+02:00 + iso_candidate = s + if iso_candidate.endswith("Z"): + iso_candidate = iso_candidate[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(iso_candidate.replace("T", " ")) + # Drop tzinfo to keep backward-compatible naive datetimes + if dt.tzinfo is not None: + dt = dt.replace(tzinfo=None) + return dt + except Exception: + pass + + # --- Normalize fractional seconds to <= 6 digits (microseconds) --- + # Works for both 'YYYY-MM-DD ...' and 'DD.MM.YYYY ...' + def _normalize_frac(text: str) -> str: + # split timezone if any to avoid touching the offset part + tz_match = re.search(r'([+-]\d{2}:\d{2}|[+-]\d{4})$', text) + tz = tz_match.group(0) if tz_match else "" + core = text[: -len(tz)] if tz else text + + if '.' 
in core: + head, frac = core.split('.', 1) + # cut off any trailing timezone-like part accidentally captured + frac = re.split(r'([+-]\d{2}:\d{2}|[+-]\d{4})', frac)[0] + frac = re.sub(r'\D', '', frac) # keep only digits + if len(frac) > 6: + frac = frac[:6] + elif 0 < len(frac) < 6: + frac = frac.ljust(6, '0') + core = f"{head}.{frac}" + return core + tz + + s_norm = _normalize_frac(s) + + # --- Try explicit known formats (old + new) --- + fmts = [ + # legacy ISO-like (kept first for backward compatibility) + "%Y-%m-%d %H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + # day-first variants + "%d.%m.%Y %H:%M:%S.%f", + "%d.%m.%Y %H:%M:%S", + "%d.%m.%Y %H:%M", + # allow 'T' separator explicitly + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M", + ] + for fmt in fmts: + try: + return datetime.strptime(s_norm, fmt) + except Exception: + continue + + # --- Last resort: pandas inference (dayfirst=True) --- + dt = pd.to_datetime(s_norm, dayfirst=True, errors="coerce", utc=False) + if pd.isna(dt): + raise ValueError(f"Unparsable timestamp: {s}") + # Convert pandas Timestamp to naive datetime (drop tz if any) + py_dt = dt.to_pydatetime() + if hasattr(py_dt, "tzinfo") and py_dt.tzinfo is not None: + py_dt = py_dt.replace(tzinfo=None) + return py_dt + + +def safe_float(value): + """Safely converts a value to float. + Handles None, empty strings, and strips whitespace. + + Args: + value (str or float): Input value. + + Returns: + float or None: Parsed float or None if conversion fails. + """ + if isinstance(value, float) or value is None: + return value + try: + return float(value.strip()) if value.strip() else None + except ValueError: + return None + +# --- Interpolation --- +def interpolate_points(start, end, interval, columns_to_interpolate): + """Generates linearly interpolated points between two observations. + + Args: + start (dict): The first observation row. + end (dict): The second observation row. 
+ interval (timedelta): Interval at which to interpolate (e.g. 1 minute). + columns_to_interpolate (list): List of column names to interpolate. + + Returns: + list: A list of interpolated rows (dicts) between start and end. + """ + + start_time = parse_timestamp(start["timestamp"]) + end_time = parse_timestamp(end["timestamp"]) + + if start_time >= end_time: + return [] + + total_seconds = (end_time - start_time).total_seconds() + step_seconds = interval.total_seconds() + steps = int(total_seconds // step_seconds) + + if steps < 1: + return [] + + timestamps = [ + (start_time + timedelta(seconds=i * step_seconds)).strftime("%Y-%m-%d %H:%M:%S.%f")[:23] + for i in range(1, steps + 1) + ] + + alphas = np.linspace(1 / steps, 1.0, num=steps) + + interpolated_rows = [] + for idx, alpha in enumerate(alphas): + point = dict(start) + point["timestamp"] = timestamps[idx] + for col in columns_to_interpolate: + v_start = safe_float(start.get(col)) + v_end = safe_float(end.get(col)) + if v_start is not None and v_end is not None: + point[col] = v_start + alpha * (v_end - v_start) + else: + point[col] = None + interpolated_rows.append(point) + + return interpolated_rows + +# --- Fill Missing Data --- +def fill_missing_data(data): + """Fill missing lon/lat via linear interpolation between bounding points. + Uses actual lon/lat column names resolved from the data header. + Writes results back into the *same* lon/lat columns. 
+ """ + if not data: + return data + + # derive header from the first row and resolve actual lon/lat keys + fieldnames = list(data[0].keys()) + lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + id_key_in = resolve_id_key(fieldnames) + # if cannot resolve — nothing to do safely + if not lon_key or not lat_key: + return data + + i = 0 + while i < len(data): + # seek a block of rows where either lon or lat is missing + if data[i].get(lon_key) is None or data[i].get(lat_key) is None: + start_idx = i - 1 + while i < len(data) and (data[i].get(lon_key) is None or data[i].get(lat_key) is None): + i += 1 + end_idx = i + + # interpolate only if both ends exist + if 0 <= start_idx < len(data) and end_idx < len(data): + start = data[start_idx] + end = data[end_idx] + start_time = parse_timestamp(start["timestamp"]) + end_time = parse_timestamp(end["timestamp"]) + total_seconds = (end_time - start_time).total_seconds() or 0.0 + if total_seconds <= 0: + continue + + for j in range(start_idx + 1, end_idx): + current_time = parse_timestamp(data[j]["timestamp"]) + alpha = (current_time - start_time).total_seconds() / total_seconds + if start.get(lon_key) is not None and end.get(lon_key) is not None: + data[j][lon_key] = start[lon_key] + alpha * (end[lon_key] - start[lon_key]) + if start.get(lat_key) is not None and end.get(lat_key) is not None: + data[j][lat_key] = start[lat_key] + alpha * (end[lat_key] - start[lat_key]) + else: + i += 1 + + return data + +# --- Averaging --- +def average_by_time_interval(data, interval, columns_to_interpolate, actual_start_time, actual_end_time, allow_single=True): + """Averages numeric values over fixed time intervals. + + Args: + data (list of dict): Interpolated time series. + interval (timedelta): Averaging interval (e.g. 30 minutes). + columns_to_interpolate (list): Numeric columns to average. + actual_start_time (datetime): Start of valid time window. + actual_end_time (datetime): End of valid time window. 
+ allow_single (bool): Whether to keep single-record intervals. + + Returns: + list of dict: Averaged records by time interval. + """ + + if not data: + return [] + + df = pd.DataFrame(data).copy() + df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce') + df = df.dropna(subset=["timestamp"]) + df = df.sort_values("timestamp") + + interval_minutes = int(interval.total_seconds() // 60) + df["interval_start"] = df["timestamp"].dt.floor(f"{interval_minutes}T") + grouped = df.groupby("interval_start") + + numeric_cols = [] + for col in columns_to_interpolate: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + numeric_cols.append(col) + + result = grouped[numeric_cols].mean() if numeric_cols else pd.DataFrame(index=grouped.size().index) + metadata_cols = [col for col in df.columns if col not in numeric_cols + ["timestamp", "interval_start"]] + for col in metadata_cols: + result[col] = grouped[col].first() + + result = result.reset_index() + result = result.rename(columns={"interval_start": "timestamp"}) + + if not allow_single: + group_sizes = grouped.size() + valid_groups = group_sizes[group_sizes > 1].index + result = result[result["timestamp"].isin(valid_groups)] + + result["timestamp"] = result["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:23] + return result.to_dict(orient="records") + +# --- Validation --- +def validate_and_process_csv(file_path): + """ + Inspect a Movebank CSV header and return a list of ORIGINAL column names + that are suitable for interpolation/averaging. + + - Robust to header variations: '-', '_', '.', ':' are treated equally. + - Picks synonyms for lon/lat and common numeric fields (e.g., eobs:temperature). + - Time/ID columns are detected but EXCLUDED from the returned list. + - Returns ORIGINAL header names (exactly as in the file). 
+ + Returns + ------- + list[str] + Ordered list of present columns to be used as numeric candidates for + interpolation/averaging (e.g., ['location_lon', 'location_lat', 'eobs:temperature', ...]). + """ + + def _norm(s: str) -> str: + # normalize header keys: "EOBS:Temperature" -> "eobs_temperature" + return re.sub(r"[-:.\s]+", "_", str(s).lower()).strip("_") + + # 1) read header + try: + with open(Path(file_path), "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + raw_fields = reader.fieldnames or [] + except Exception as e: + print(f"[validate_and_process_csv] Failed to read header: {e}") + return [] + + if not raw_fields: + return [] + + # 2) build normalized->original map + norm_to_orig = {} + for col in raw_fields: + nk = _norm(col) + # keep the first occurrence to preserve a stable, human header where possible + if nk not in norm_to_orig: + norm_to_orig[nk] = col + + present = set(norm_to_orig.keys()) + + # 3) define synonym groups + time_syns = [ + "timestamp", "eobs_start_timestamp", "eobs:start-timestamp", + "datetime", "date_time", "date", "time" + ] + id_syns = [ + "individual_local_identifier", "individual-local-identifier" + ] + lon_syns = [ + "location_long", "location_lon", "location-long", "location-lon", + "longitude", "lon", "location_longitude", "location.longitude" + ] + lat_syns = [ + "location_lat", "location-lat", + "latitude", "lat", "location_latitude", "location.latitude" + ] + # common numeric fields you typically interpolate/average + temp_syns = ["eobs_temperature", "eobs:temperature", "temperature"] + gspeed_syns = ["ground_speed", "ground-speed", "speed_2d", "speed"] + hae_syns = [ + "height_above_ellipsoid", "height-above-ellipsoid", + "gps_altitude", "altitude", "altitude_above_sea_level" + ] + + def _pick_first(syns): + for s in syns: + nk = _norm(s) + if nk in present: + return norm_to_orig[nk] + return None + + # 4) choose actual originals (if present) + time_col = _pick_first(time_syns) # not returned, for 
info/exclusion only + id_col = _pick_first(id_syns) # not returned + + lon_col = _pick_first(lon_syns) + lat_col = _pick_first(lat_syns) + temp_col = _pick_first(temp_syns) + gs_col = _pick_first(gspeed_syns) + hae_col = _pick_first(hae_syns) + + # 5) build the result list (keep a sensible order: coords first) + result = [] + for c in (lon_col, lat_col, temp_col, gs_col, hae_col): + if c and c not in result: + result.append(c) + + # You may also include any additional numeric columns here if you wish: + # e.g., any column whose normalized name starts with "eobs_" and is present. + # Just make sure to exclude time/id-like names: + time_like = {_norm(x) for x in time_syns} + id_like = {_norm(x) for x in id_syns} + + for nk, orig in norm_to_orig.items(): + if nk in time_like or nk in id_like: + continue + # already included? + if orig in result: + continue + # optional heuristic: include other eobs:* numeric-looking fields + if nk.startswith("eobs_"): + result.append(orig) + + return result + +# --- Main Processing --- +def process_csv_interp_or_averaging(start_time_str, end_time_str, interval_minutes, + csv_file, output_csv, local_identifier, + columns_to_interpolate=None, allow_single=True, + deployment_time_gap=60, min_expected_obs=1, + start_from_midnight=False): + """Processes a single individual's movement data with interpolation and optional averaging. + Includes filtering by time and ID, interpolation, averaging, session splitting, and final cleanup. + + Args: + start_time_str (str): Start datetime string. + end_time_str (str): End datetime string. + interval_minutes (int): Time step for averaging. + csv_file (Path): Path to input Movebank CSV. + output_csv (str): Output file path template. + local_identifier (str): Individual ID to process. + columns_to_interpolate (list): Columns to interpolate. + allow_single (bool): Keep intervals with one record. + deployment_time_gap (int): Maximum gap (min) to split sessions. 
+ min_expected_obs (int): Minimum points required to keep a session. + start_from_midnight (bool): If True, truncate to 00:00 and start from it. + + Returns: + list: List of generated CSV file paths. + """ + if columns_to_interpolate is None: + columns_to_interpolate = [] + columns_to_interpolate = [col for col in columns_to_interpolate if col not in ("timestamp", "eobs:start-timestamp")] + + start_time = parse_timestamp(start_time_str) + end_time = parse_timestamp(end_time_str) + interval = timedelta(minutes=interval_minutes) + min_interval = timedelta(minutes=1) + + data = [] + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + time_key_in = resolve_time_column(fieldnames) + lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + id_key_in = resolve_id_key(fieldnames) + for row in reader: + try: + ts_raw = row.get(time_key_in) + if not ts_raw: + continue + row_time = parse_timestamp(ts_raw) + except Exception: + continue + + row_id = row.get(id_key_in) if id_key_in else None + if row_id is None: + continue + + row_id_str = str(row_id).strip() + expected_id_str = str(local_identifier).strip() + + if row_id_str != expected_id_str: + continue + + if start_time <= row_time <= end_time: + row["timestamp"] = ts_raw + data.append(row) + + if len(data) < 2: + print("Not enough data after filtering.") + return [] + + data.sort(key=lambda x: parse_timestamp(x["timestamp"])) + + # Cut off at midnight and insert 00:00:00 + if start_from_midnight and data: + first_time = parse_timestamp(data[0]["timestamp"]) + midnight = first_time.replace(hour=0, minute=0, second=0, microsecond=0) + + # Cut off points to 00:00:00 + data = [row for row in data if parse_timestamp(row["timestamp"]) >= midnight] + + # If there is no exact point 00:00:00 — insert an artificial one + if data and parse_timestamp(data[0]["timestamp"]) > midnight: + clone = dict(data[0]) + clone["timestamp"] = midnight.strftime("%Y-%m-%d 
%H:%M:%S.%f")[:23] + for col in columns_to_interpolate: + if col in clone: + clone[col] = clone[col] # copy the value from the first real point + data.insert(0, clone) + + if len(data) < 2: + print("Not enough data after start_from_midnight filtering.") + return [] + + for col in ["timestamp"] + columns_to_interpolate: + if col not in fieldnames: + fieldnames.append(col) + + data = fill_missing_data(data) + + def split_into_sessions(data, max_gap_minutes): + max_gap = timedelta(minutes=max_gap_minutes) + sessions = [] + current_session = [] + + for i, row in enumerate(data): + if i == 0: + current_session.append(row) + continue + + prev_time = parse_timestamp(data[i-1]['timestamp']) + curr_time = parse_timestamp(row['timestamp']) + if curr_time - prev_time > max_gap: + if current_session: + sessions.append(current_session) + current_session = [row] + else: + current_session.append(row) + + if current_session: + sessions.append(current_session) + return sessions + + sessions = split_into_sessions(data, deployment_time_gap) + result_paths = [] + + for idx, session in enumerate(sessions): + if len(session) < min_expected_obs: + print(f"Skipping session {idx+1} with only {len(session)} observations (less than min_expected_obs={min_expected_obs})") + continue + + interpolated_rows = [] + for i in range(len(session) - 1): + interpolated_rows.append(session[i]) + interpolated_rows.extend(interpolate_points(session[i], session[i + 1], min_interval, columns_to_interpolate)) + if session: + interpolated_rows.append(session[-1]) + + if interval.total_seconds() > 60: + result_rows = average_by_time_interval( + interpolated_rows, interval, columns_to_interpolate, + actual_start_time=parse_timestamp(session[0]['timestamp']), + actual_end_time=parse_timestamp(session[-1]['timestamp']), + allow_single=allow_single + ) + else: + result_rows = interpolated_rows + + start_str = session[0]['timestamp'].replace(":", "-").replace(" ", "T")[:16] + end_str = 
session[-1]['timestamp'].replace(":", "-").replace(" ", "T")[:16] + session_output_path = output_csv.replace(".csv", f"__{start_str}_to_{end_str}.csv") + + with open(session_output_path, "w", newline='', encoding="utf-8") as f: + if "individual-local-identifier-deployment-time" not in fieldnames: + fieldnames.append("individual-local-identifier-deployment-time") + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for row in result_rows: + ts = row.get("timestamp") + if ts is None: + continue + row["timestamp"] = str(ts)[:23] + row["individual-local-identifier-deployment-time"] = Path(session_output_path).stem + writer.writerow(row) + result_paths.append(session_output_path) + try: + df_check = normalize_column_names(pd.read_csv(session_output_path, low_memory=False)) + ... + if numeric_cols_to_fix: + ... + df_check.to_csv(session_output_path, index=False) + print(f"Interpolated missing values in: {numeric_cols_to_fix} for file {session_output_path}") + except Exception as e: + print(f"Interpolation post-check failed for {session_output_path}: {e}") + + # Checking and interpolating NaN after writing + cols_to_check_for_nan = [ + "timestamp", "location_long", "location_lat", + "eobs_start_timestamp", "eobs_temperature", + "ground_speed", "height_above_ellipsoid" + ] + try: + df_check = normalize_column_names(pd.read_csv(session_output_path, low_memory=False)) + numeric_cols_to_fix = [col for col in cols_to_check_for_nan if col in df_check.columns and df_check[col].dtype in ["float64", "int64"] and df_check[col].isna().any()] + + if numeric_cols_to_fix: + df_check["timestamp"] = pd.to_datetime(df_check["timestamp"], errors="coerce") + df_check = df_check.set_index("timestamp") + df_check[numeric_cols_to_fix] = df_check[numeric_cols_to_fix].interpolate(method="time", limit_direction="both") + df_check = df_check.reset_index() + df_check.to_csv(session_output_path, index=False) + print(f"Interpolated missing values in: {numeric_cols_to_fix} for 
def merge_csv_files_from_folder(folder_path: Path, delete_empty_columns: bool) -> tuple[pd.DataFrame, list]:
    """Merge every ``*.csv`` file of a folder into one DataFrame.

    Files are read in sorted filename order and their headers are passed
    through ``normalize_column_names`` before concatenation.

    Args:
        folder_path (Path | str): Directory containing CSV files.
        delete_empty_columns (bool): If True, columns that are missing from
            at least one file are dropped before merging. If False, they are
            kept and padded with NaN by ``pd.concat``.

    Returns:
        tuple: ``(merged DataFrame, sorted list of non-shared column names)``.
        The list is reported in both modes; the columns are only actually
        removed when ``delete_empty_columns`` is True.

    Raises:
        ValueError: If the folder contains no ``*.csv`` file.
    """
    # Accept plain strings as well as Path objects.
    csv_files = sorted(Path(folder_path).glob("*.csv"))
    if not csv_files:
        raise ValueError("No CSV files found in the selected folder.")

    dataframes = [normalize_column_names(pd.read_csv(f)) for f in csv_files]

    # A column is "missing" when at least one file does not provide it,
    # i.e. it is in the union of headers but not in their intersection.
    column_sets = [set(df.columns) for df in dataframes]
    all_columns = set().union(*column_sets)
    shared_columns = set.intersection(*column_sets)
    missing_columns = all_columns - shared_columns

    if delete_empty_columns and missing_columns:
        to_drop = list(missing_columns)
        dataframes = [df.drop(columns=to_drop, errors='ignore') for df in dataframes]

    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df, sorted(missing_columns)
def generate_individual_csvs_for_local_ids(csv_file: Path, ids: list,
                                           start_time, end_time, interval_minutes: int,
                                           output_path_template: str, columns_to_interpolate: list,
                                           deployment_time_gap: int = 60,
                                           min_expected_obs: int = 100,
                                           start_from_midnight=False) -> list:
    """Process movement data for several individuals into separate files.

    Calls ``process_csv_interp_or_averaging`` once per ID and collects the
    paths it returns.

    Args:
        csv_file (Path): Input CSV file path.
        ids (list): Local identifiers (tags). Non-string values (e.g. ints
            read by pandas) are accepted; they are converted with ``str``
            when building the output filename.
        start_time (str): Start datetime string.
        end_time (str): End datetime string.
        interval_minutes (int): Time step in minutes.
        output_path_template (str): Base output path; ``".csv"`` in it is
            replaced by ``"_<safe id>.csv"`` for each individual.
        columns_to_interpolate (list): Columns for interpolation.
        deployment_time_gap (int): Max gap in minutes to split sessions.
        min_expected_obs (int): Minimum observations per session.
        start_from_midnight (bool): If True, truncate sessions to start at 00:00.

    Returns:
        list: Output file paths of all individuals, in the order of ``ids``.
    """
    output_files = []
    for local_id in ids:  # do not shadow the builtin ``id``
        # safe_filename() runs a regex over its argument, so it needs a str.
        save_name_by_id = safe_filename(str(local_id))
        output = output_path_template.replace(".csv", f"_{save_name_by_id}.csv")
        result_paths = process_csv_interp_or_averaging(
            start_time_str=start_time,
            end_time_str=end_time,
            interval_minutes=interval_minutes,
            csv_file=csv_file,
            output_csv=output,
            local_identifier=local_id,
            columns_to_interpolate=columns_to_interpolate,
            deployment_time_gap=deployment_time_gap,
            min_expected_obs=min_expected_obs,
            start_from_midnight=start_from_midnight
        )
        output_files.extend(result_paths)
    return output_files
+ output_path_template : str + Template for output CSV path; per-ID files are created by appending + `_{safe_id}__interp_inplace_le1d.csv` before the ".csv" suffix. + max_gap_minutes : int, default 24*60 + Maximum allowed gap (in minutes) between two valid values to fill NaNs inside. + + Returns + ------- + list of str + Paths to the created per-ID CSV files. + + Notes + ----- + - Time parsing relies on `parse_timestamp`, which should support both ISO-like + and 'DD.MM.YYYY HH:MM[:SS[.fff]]' formats (and possibly 'T'/'Z'/offsets). + - The function matches the time & ID columns via *normalized* header keys, + but preserves original headers in the written output. + """ + + # --- Helpers (scoped locally to avoid polluting the module namespace) ---------- + def _norm_key(s: str) -> str: + """Normalize a single header key to a canonical form.""" + return re.sub(r"[-:.\s]+", "_", str(s).lower()).strip("_") + + def _norm_keys(d: dict) -> dict: + """Normalize all keys in a row (dict) for robust lookup; values unchanged.""" + return {_norm_key(k): v for k, v in d.items()} + + def _pick_time_col_from_df(df: pd.DataFrame) -> str: + """ + Choose which original column in df should store timestamps in the output. + Preference order: + 1) TIME_COLUMN (global) if present (matching by normalized name), + 2) 'timestamp', + 3) 'eobs_start_timestamp', + 4) 'time', 'datetime', 'date'. + Returns the *original* column name if found; otherwise returns TIME_COLUMN + (creating it later if missing). + """ + # Map normalized -> original + colmap = {_norm_key(c): c for c in df.columns} + + # TIME_COLUMN may be 'timestamp' or 'eobs:start-timestamp', etc. 
+ time_key_norm = _norm_key(TIME_COLUMN) + candidates_norm = [ + time_key_norm, + "timestamp", + "eobs_start_timestamp", + "time", + "datetime", + "date", + ] + for nk in candidates_norm: + if nk in colmap: + return colmap[nk] + # fallback: use the global TIME_COLUMN string as-is + return TIME_COLUMN + + # --- Parse time window --------------------------------------------------------- + start_time = parse_timestamp(start_time_str) + end_time = parse_timestamp(end_time_str) + created_paths: list[str] = [] + + # --- Build a set of time-like normalized names to exclude from interpolation --- + time_like_norm = {"timestamp", "eobs_start_timestamp", "time", "datetime", "date"} + + # Prepare normalized view of the interpolation column list (but we will keep + # original names when writing to CSV) + + for local_id in ids: + rows = [] + dts = [] # parsed datetimes aligned with `rows` + fieldnames = None + + # --- Read only the current ID and time range ------------------------------ + with open(csv_file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames or [] + time_key_in = resolve_time_column(fieldnames) + #lon_key, lat_key = resolve_lon_lat_keys(fieldnames) + for row in reader: + norm = _norm_keys(row) + + # Time value: prefer TIME_COLUMN, then common alternates + time_key_norm = _norm_key(TIME_COLUMN) + ts_str = norm.get(_norm_key(time_key_in)) \ + or norm.get("timestamp") \ + or norm.get("eobs_start_timestamp") \ + or norm.get("time") \ + or norm.get("datetime") \ + or norm.get("date") + if not ts_str: + continue + try: + t = parse_timestamp(ts_str) + except Exception: + # Skip rows with unparsable timestamps + continue + + # ID filter + rid = (norm.get("individual_local_identifier") or "").strip() + if rid != str(local_id).strip(): + continue + + # Time window filter + if not (start_time <= t <= end_time): + continue + + rows.append(row) # keep original headers/values + dts.append(t) + + # If nothing matched for this ID: skip 
+ if not rows: + continue + + # --- Build DataFrame preserving original headers -------------------------- + df = pd.DataFrame(rows) + df["__dt"] = pd.to_datetime(dts) # already parsed, but ensure dtype + df = df.sort_values("__dt").set_index("__dt") + + # --- Interpolate each numeric column within allowed gaps ------------------- + # Keep only columns explicitly requested AND present in df, excluding any time-like + cols_to_fill = [] + for c in (columns_to_interpolate or []): + if c not in df.columns: + continue + if _norm_key(c) in time_like_norm: + continue + cols_to_fill.append(c) + + if cols_to_fill: + idx = df.index + max_gap = pd.Timedelta(minutes=max_gap_minutes) + + for col in cols_to_fill: + # Convert to numeric; non-numeric -> NaN + s = pd.to_numeric(df[col], errors="coerce") + if s.isna().all(): + # Nothing to interpolate in this column + df[col] = s + continue + + # Identify rows that are NaN between two valid values + orig_na = s.isna() + + # Timestamps of previous/next valid values + prev_t = pd.Series(idx.where(s.notna(), pd.NaT), index=idx).ffill() + next_t = pd.Series(idx.where(s.notna(), pd.NaT), index=idx).bfill() + + # Total gap length between surrounding valid values + total_gap = next_t - prev_t + allowed = ( + orig_na + & prev_t.notna() + & next_t.notna() + & (total_gap <= max_gap) + ) + + # Time-based interpolation only *inside* valid spans + s_interp = s.interpolate(method="time", limit_area="inside") + s_filled = s.copy() + s_filled[allowed] = s_interp[allowed] + + df[col] = s_filled + + time_col_out = resolve_time_column(df.columns) + # --- Prepare output: restore a string time column and drop helper ---------- + out = df.reset_index(drop=False) + # format time once + out_ts = out["__dt"].dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:23] + # write back into BOTH the chosen original time column and the canonical 'timestamp' + out[time_col_out] = out_ts + out["timestamp"] = out_ts + out = out.drop(columns=["__dt"]) + + # --- Write per-ID CSV 
def resolve_lon_lat_keys(fieldnames):
    """
    Resolve the actual longitude/latitude column names from a CSV header.

    Headers are compared in normalized form (lower-case; runs of ``-``,
    ``:``, ``.``, ``_`` and whitespace collapsed to a single ``_``), so
    ``location-long``, ``location_long`` and ``Location.Long`` all match
    the same synonym. The legacy dash-style names are therefore covered by
    the normalized lookup and need no separate fallback.

    Args:
        fieldnames: Iterable of header strings; ``None`` is treated as empty.

    Returns:
        tuple: ``(lon_key, lat_key)`` as *original* header strings; an
        element is ``None`` when no matching column exists. When several
        headers match, the synonym listed first below wins.
    """
    def _norm(s: str) -> str:
        return re.sub(r"[-:._\s]+", "_", str(s).lower()).strip("_")

    # normalized header -> original header (a later duplicate overrides an earlier one)
    norm_map = {_norm(c): c for c in (fieldnames or [])}

    # Synonyms in normalized form, in order of preference.
    lon_syn = ("location_long", "location_lon", "longitude", "lon", "location_longitude")
    lat_syn = ("location_lat", "latitude", "lat", "location_latitude")

    lon_key = next((norm_map[s] for s in lon_syn if s in norm_map), None)
    lat_key = next((norm_map[s] for s in lat_syn if s in norm_map), None)

    return lon_key, lat_key
def resolve_time_column(fieldnames) -> str:
    """
    Return the ORIGINAL header that holds the timestamps.

    Headers and candidates are compared through ``_norm_key``. The first
    candidate that matches a header wins, in this order:
      1) the module-level TIME_COLUMN
      2) 'timestamp'
      3) 'eobs:start-timestamp' / 'eobs_start_timestamp'
      4) 'time', 'datetime', 'date'
    If nothing matches, TIME_COLUMN itself is returned.
    """
    # normalized header -> header exactly as written in the file
    originals = {}
    for header in fieldnames:
        originals[_norm_key(header)] = header

    preference = (
        _norm_key(TIME_COLUMN),  # whatever the module-level TIME_COLUMN is
        "timestamp",
        "eobs:start-timestamp",
        "eobs_start_timestamp",
        "time",
        "datetime",
        "date",
    )
    matches = (originals[key] for key in map(_norm_key, preference) if key in originals)
    for found in matches:
        return found
    return TIME_COLUMN  # fallback
def safe_open_nc_with_time_decoding(path):
    """
    Open a NetCDF file and decode its time axis manually.

    The file is opened with ``decode_times=False`` and the time values are
    decoded from the ``units`` / ``calendar`` attributes, so non-standard
    calendars (julian, gregorian, 360_day, noleap, ...) are supported.
    The time coordinate is always returned under the name 'time' and as
    pandas datetimes, even if decoding produced cftime objects.

    Args:
        path: Path of the NetCDF file.

    Returns:
        xarray.Dataset whose 'time' coordinate holds the decoded datetimes.
        The caller is responsible for closing it.

    Raises:
        RuntimeError: If the file cannot be opened, has no time-like
            coordinate, or its time values cannot be decoded. The original
            exception is attached as ``__cause__``.
    """
    opened = None  # handle returned by open_dataset, closed again on failure
    try:
        opened = xr.open_dataset(path, decode_times=False)
        ds = opened

        time_name = _detect_time_name(ds)
        if time_name is None:
            raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').")

        # if time is in variables but not in coords — make it a coordinate
        if time_name in ds.variables and time_name not in ds.coords:
            ds = ds.set_coords(time_name)

        time_var = ds[time_name]
        units = str(time_var.attrs.get("units", ""))
        calendar = str(time_var.attrs.get("calendar", "standard")).lower()

        if "since" not in units:
            # sometimes there are "epoch seconds" without 'since'
            # add default: seconds since 1970-01-01
            if units.strip() == "" and pd.api.types.is_integer_dtype(time_var.dtype):
                units = "seconds since 1970-01-01"
                calendar = "proleptic_gregorian"

        decoded = xr.coding.times.decode_cf_datetime(time_var.values, units, calendar)

        # Peek at the first element without failing on an empty time axis.
        first = np.asarray(decoded).ravel()[:1]
        if len(first) and hasattr(first[0], "strftime"):
            # cftime objects — convert via their string representation
            decoded = pd.to_datetime([str(d) for d in decoded])
        else:
            decoded = pd.to_datetime(decoded)

        # rename the time coordinate to the unified 'time'
        if time_name != "time":
            ds = ds.assign_coords({time_name: decoded}).rename({time_name: "time"})
        else:
            ds = ds.assign_coords(time=decoded)

        return ds

    except Exception as e:
        # Do not leak the open file handle when decoding fails.
        if opened is not None:
            opened.close()
        raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") from e
def load_taxa_and_ids_from_csv(file_path):
    """
    Read a Movebank-style CSV and list its taxa and individual IDs.

    Header names are matched in normalized form (lower-case; runs of
    ``-``, ``:``, ``.``, ``_`` and whitespace collapsed to ``_``), so
    ``individual-local-identifier``, ``individual:local-identifier`` and
    ``individual_local_identifier`` are all recognized.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        tuple: ``(df, unique_taxa, unique_ids, error)``
            - df: the loaded DataFrame, or None on failure
            - unique_taxa: sorted unique taxon names as strings (empty if
              the taxon column is absent)
            - unique_ids: sorted unique individual IDs as strings
            - error: None on success, otherwise a message string

        Errors are reported through the last element instead of being raised.
    """
    try:
        df = pd.read_csv(file_path)
        # normalized header -> original header
        columns = {
            re.sub(r"[-:._\s]+", "_", str(col).lower()).strip("_"): col
            for col in df.columns
        }
        id_key = "individual_local_identifier"
        taxon_key = "individual_taxon_canonical_name"
        id_col = columns.get(id_key)
        taxon_col = columns.get(taxon_key)
        if id_col is None:
            return None, [], [], "No column found for individual-local-identifier"

        unique_ids = sorted(df[id_col].dropna().astype(str).unique())
        unique_taxa = sorted(df[taxon_col].dropna().astype(str).unique()) if taxon_col else []

        return df, unique_taxa, unique_ids, None

    except Exception as e:
        # Best-effort loader for the UI: surface the problem as a message.
        return None, [], [], str(e)
print("Selected variables:", selected_env_vars) + print("From files:", [env_var_map[v] for v in selected_env_vars]) + print("Selected IDs:", selected_ids) + print("Movebank file:", movebank_path) + print("Boundary file:", boundary_path) + print("Interpolation method:", interpolation_method) + + # === Step 1: Spatial filtering === + df_filtered, _ = filter_points_within_boundary(movebank_path, selected_ids, boundary_path, bbox=bbox) + if df_filtered.empty: + print("[WARNING] No points within the boundary.") + return + + # === Step 2: Loading and interpolation of environmental data === + result = load_selected_environmental_data(df_filtered, env_var_map, + selected_env_vars, movebank_path, + interpolation_method, smoothing_k=smoothing_k) + if result is None: + print("[ERROR] Environmental data was not loaded.") + return + + df_annotated, nc_start, nc_end = result + +#### diagnistic + var = selected_env_vars[0] if selected_env_vars else None + if var in df_annotated.columns: + in_nc = df_annotated["timestamp"].between( + pd.to_datetime(df_annotated["timestamp"]).min() if pd.isna(nc_start) else nc_start, + pd.to_datetime(df_annotated["timestamp"]).max() if pd.isna(nc_end) else nc_end + ) + filled_total = df_annotated[var].notna().sum() + filled_in_nc = df_annotated.loc[in_nc, var].notna().sum() + print(f"[DEBUG] Filled '{var}': total={filled_total}, within-NC-window={filled_in_nc}") + else: + print(f"[WARNING] Column '{var}' not found in annotated DataFrame.") +##### + + # === Step 3: Time filtering === + df_time_filtered = df_annotated.copy() + print("[INFO] Full timestamp range preserved. 
Outside-NC values will be NaN.") + + # === Step 4: Saving the final result === + if out_csv_path: + out_path = Path(out_csv_path) + else: + out_path = Path(movebank_path).parent / "annotated_env.csv" + df_time_filtered = df_time_filtered.drop(columns=["geometry", "nc_lat", "nc_lon"], errors="ignore") + df_time_filtered.to_csv(out_path, index=False, encoding="utf-8-sig", date_format="%Y-%m-%d %H:%M:%S") + print(f"[INFO] Final filtered annotation saved to {out_path}") + + # === Step 5: Saving by individual ID === + output_folder = out_path.parent / "annotated_individuals" + output_folder.mkdir(parents=True, exist_ok=True) + + id_col = "individual_local_identifier" + if id_col in df_time_filtered.columns: + unique_ids = df_time_filtered[id_col].dropna().unique() + for uid in unique_ids: + df_id = df_time_filtered[df_time_filtered[id_col] == uid] + safe_uid = re.sub(r"[^\w\-]", "_", str(uid)) + out_file = output_folder / f"annotated_env_{safe_uid}.csv" + df_id.to_csv(out_file, index=False) + print(f"[INFO] Saved {len(unique_ids)} individual files to {output_folder}") + else: + print("[WARNING] Column 'individual_local_identifier' not found. Skipping per-ID export.") + + +def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=None, bbox=None): + print("[DEBUG] Filtering is started") + df = pd.read_csv(movebank_path) + df.columns = [re.sub(r"[-:.\s]+", "_", col.lower()) for col in df.columns] + if "location_long" in df.columns and "location_lon" not in df.columns: + df["location_lon"] = df["location_long"] + if "timestamp" not in df.columns and "eobs_start_timestamp" in df.columns: + df["timestamp"] = df["eobs_start_timestamp"] + + required_cols = {"location_lat", "location_lon", "individual_local_identifier", "timestamp"} + if not required_cols.issubset(df.columns): + raise ValueError(f"Required columns are missing in Movebank file. 
Missing: {required_cols - set(df.columns)}") + + # ID-filter + df = df[df["individual_local_identifier"].isin(selected_ids)] + df = interpolate_missing_coordinates(df) + + output_path = Path(movebank_path).parent / "trimmed.csv" + if bbox is not None: + S, N, W, E = map(float, (bbox["S"], bbox["N"], bbox["W"], bbox["E"])) + m = df["location_lat"].between(S, N) & df["location_lon"].between(W, E) + df = df.loc[m].copy() + df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] + gdf_filtered = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + + try: + if gdf_filtered.empty: + print("[INFO] No points within bbox. File not saved.") + else: + gdf_filtered.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) + print(f"[INFO] (bbox) Data saved to {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save (bbox) data: {e}") + return gdf_filtered, output_path + + # case: boundary from shp/geojson + df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] + gdf_points = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") + + if boundary_path is None: + print("[INFO] No boundary provided. Skipping spatial clipping (all selected IDs kept).") + try: + gdf_points.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) + print(f"[INFO] (No-boundary) Data saved to {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save (no-boundary) data: {e}") + return gdf_points, output_path + + gdf_boundary = gpd.read_file(boundary_path) + if gdf_boundary.crs != gdf_points.crs: + gdf_boundary = gdf_boundary.to_crs(gdf_points.crs) + + gdf_filtered = gpd.sjoin(gdf_points, gdf_boundary[["geometry"]], predicate="within", how="inner").drop(columns="index_right") + + try: + if gdf_filtered.empty: + print("[INFO] No points within boundary. 
def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing 'location_lat' / 'location_lon' values by time-based
    interpolation.

    Timestamps are parsed with ``dayfirst=True``; rows whose timestamp
    cannot be parsed are removed. Coordinates are coerced to numeric
    (non-numeric values become NaN) and gaps are filled linearly in time,
    extending the first/last known position to both ends.

    If the frame has an 'individual_local_identifier' column, every
    individual is interpolated on its own track, so a gap of one animal is
    never filled from the positions of another one. Without that column
    the whole frame is treated as a single track.

    Args:
        df: DataFrame with at least 'timestamp', 'location_lat',
            'location_lon'. It is not modified.

    Returns:
        A new DataFrame sorted by timestamp, with a fresh RangeIndex and
        'timestamp' as the first column.

    Raises:
        ValueError: If a required column is missing.
    """
    required_cols = {"timestamp", "location_lat", "location_lon"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"DataFrame must contain columns: {required_cols}")

    coord_cols = ["location_lat", "location_lon"]
    id_col = "individual_local_identifier"

    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce")

    n_missing = df["timestamp"].isna().sum()
    if n_missing > 0:
        print(f"[INFO] {n_missing} rows with missing or invalid timestamps were removed before interpolation.")

    df = df.dropna(subset=["timestamp"])  # Remove NaT before creating the index
    df = df.sort_values("timestamp")
    df = df.set_index("timestamp")  # method="time" needs a DatetimeIndex

    for coord in coord_cols:
        df[coord] = pd.to_numeric(df[coord], errors="coerce")

    def _fill(track):
        return track.interpolate(method="time", limit_direction="both")

    if id_col in df.columns:
        # transform() returns rows in the original order, so the result
        # lines up with df even when timestamps repeat across individuals.
        df[coord_cols] = df.groupby(id_col, dropna=False, sort=False)[coord_cols].transform(_fill)
    else:
        df[coord_cols] = _fill(df[coord_cols])

    df = df.reset_index()
    return df
def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2):
    """
    Annotate track points with environmental variables from NetCDF grids.

    Temporal: linear interpolation (1D per-point)
    Spatial: nearest neighbour (1 grid node per point)

    Args:
        df: Track points with 'timestamp', 'location_lat', 'location_lon'.
            Rows where any of these is missing/unparsable are dropped.
        env_var_map: dict variable name -> NetCDF file path.
        selected_vars: variable names to annotate; one output column each.
        movebank_path: unused here; kept for a signature parallel to the
            other annotators.
        smoothing_k: unused here (nearest neighbour uses a single node).

    Returns:
        tuple: ``(annotated DataFrame, pd.NaT, pd.NaT)``. The frame gains one
        column per variable (NaN when the file/variable is missing or the
        point lies outside the file's time range), plus 'nc_lat', 'nc_lon'
        (coordinates of the grid node used) and 'geometry'.

    Notes:
        Problems with one variable are printed and do not stop the
        annotation of the remaining variables.
    """
    from shapely.geometry import Point

    out = df.copy()
    out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce")
    out = out.dropna(subset=["timestamp", "location_lat", "location_lon"])

    # All per-row buffers below are addressed by POSITION (0..n_rows-1).
    # The DataFrame index labels may be non-contiguous after upstream
    # filtering, so they must not be used as array offsets.
    n_rows = len(out)
    nc_latitudes = np.full(n_rows, np.nan)
    nc_longitudes = np.full(n_rows, np.nan)
    point_times = list(out["timestamp"])
    point_lats = out["location_lat"].to_numpy()
    point_lons = out["location_lon"].to_numpy()

    for var in selected_vars:
        file_path = env_var_map.get(var)
        out[var] = np.nan
        if not file_path or not Path(file_path).is_file():
            print(f"[WARNING] File for {var} not found: {file_path}")
            continue

        opened = None
        values = [np.nan] * n_rows  # annotation of this variable, by position
        try:
            opened = safe_open_nc_with_time_decoding(file_path)
            ds = opened
            if var not in ds:
                print(f"[WARNING] Variable {var} not in {file_path}")
                continue

            da = ds[var]
            # determine basic axes
            dims = list(da.dims)
            lat_dim = "lat" if "lat" in dims else "latitude"
            lon_dim = "lon" if "lon" in dims else "longitude"
            ds = _ensure_sorted(ds, lat_dim, lon_dim)
            da = ds[var]
            dims = list(da.dims)
            time_dim = "time" if "time" in dims else next(
                (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims),
                None
            )
            if time_dim is None:
                raise ValueError(f"No time-like dimension in {var}: dims={dims}")
            if time_dim != "time":
                ds = ds.rename({time_dim: "time"})
                da = ds[var]
                dims = list(da.dims)
                time_dim = "time"

            # Cut out unnecessary dimensions: pressure_level, number, expver, etc.
            extra = [d for d in dims if d not in (time_dim, lat_dim, lon_dim)]
            if extra:
                sel = {}
                for d in extra:
                    dl = d.lower()
                    try:
                        coord = ds.coords[d] if d in ds.coords else ds[d]
                    except Exception:
                        coord = None

                    if dl in ("pressure_level", "isobaricinhpa", "level"):
                        level_idx = 0
                        if coord is not None:
                            try:
                                vals = np.asarray(coord.values, dtype=float)
                                # choose the level closest to 1000 hPa
                                level_idx = int(np.nanargmin(np.abs(vals - 1000.0)))
                            except Exception:
                                level_idx = 0
                        sel[d] = level_idx
                    else:
                        # other extra dimensions → take the first element
                        sel[d] = 0

                da = da.isel(**sel).squeeze()  # (time, lat, lon)

            glat = ds[lat_dim].values
            glon = ds[lon_dim].values
            gtime = pd.to_datetime(ds["time"].values).values  # datetime64[ns]
            gtime_ts = pd.to_datetime(gtime)                  # Timestamp indexable
            # time range of the file, computed once instead of per row
            t_min = gtime_ts.min()
            t_max = gtime_ts.max()

            # one point at a time — only one time series of one grid node
            for pos in range(n_rows):
                t = point_times[pos]

                # out of time range → NaN
                if t < t_min or t > t_max:
                    continue

                ii = _nearest_index(glat, point_lats[pos])
                jj = _nearest_index(glon, point_lons[pos])

                # extract the time series of one grid node
                # expect dims ("time", lat_dim, lon_dim)
                series = da.isel({lat_dim: ii, lon_dim: jj}).values  # (time,)
                values[pos] = _interp1d_time(gtime_ts, series, t)
                nc_latitudes[pos] = glat[ii]
                nc_longitudes[pos] = glon[jj]

        except Exception as e:
            print(f"[ERROR] {var}: {e}")
            continue
        finally:
            # Keep whatever was annotated before a failure, and release the file.
            out[var] = values
            if opened is not None:
                opened.close()

    out["nc_lat"] = nc_latitudes
    out["nc_lon"] = nc_longitudes
    out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])]
    return out, pd.NaT, pd.NaT
+ extra_dims = [d for d in dims_list if d not in ("time", lat_dim, lon_dim)] + if extra_dims: + sel = {} + # special logic for pressure levels: take 1000 hPa if it exists; otherwise the first + level_keys = {"pressure_level", "isobaricInhPa", "level"} + for d in extra_dims: + if d in da.coords and d.lower() in {k.lower() for k in level_keys}: + try: + lev = np.asarray(ds[d].values, dtype=float) + sel[d] = int(np.nanargmin(np.abs(lev - 1000.0))) # closest to 1000 hPa + except Exception: + sel[d] = 0 + else: + sel[d] = 0 + da = da.isel(**sel).squeeze() + + glat = ds[lat_dim].values + glon = ds[lon_dim].values + gtime = pd.to_datetime(ds["time"].values).values + gtime_ts = pd.to_datetime(gtime) + + for idx, row in out.iterrows(): + t = row["timestamp"] + xlat = row["location_lat"] + xlon = row["location_lon"] + + if t < gtime_ts.min() or t > gtime_ts.max(): + continue + + # find k nearest nodes through local window + nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) + + vals = [] + dists = [] + for ii, jj in nn_idx: + series = da.isel({lat_dim: ii, lon_dim: jj}).values # (time,) + v = _interp1d_time(gtime_ts, series, t) + vals.append(v) + dists.append(np.hypot(glat[ii] - xlat, glon[jj] - xlon)) + + out.at[idx, var] = _idw(vals, dists, p=2) + + except Exception as e: + print(f"[ERROR] {var}: {e}") + continue + + out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] + return out, pd.NaT, pd.NaT + +def convert_tif_to_nc_before_annotation(tif_paths, output_dir): + """ + onverts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. + For each variable, builds a data(time, lat, lon) array. + Returns the path to the generated .nc file. 
+ """ + tif_paths = [str(Path(p)) for p in tif_paths] + if not tif_paths: + raise ValueError("No .tif files provided") + + # 1) Group files by variable + by_var = {} + for tif in tif_paths: + vname = parse_appeears_variable_name(tif) + by_var.setdefault(vname, []).append(tif) + + lat = lon = None + data_vars = {} + + for vname, files in by_var.items(): + times = [] + planes = [] + first_geo = True + + for tif in sorted(files): + tif_name = Path(tif).name + t = parse_time_from_filename(tif_name) + times.append(t) + + with rasterio.open(tif) as src: + arr = src.read(1).astype("float32") + nodata = src.nodata + if nodata is not None: + arr = np.where(arr == nodata, np.nan, arr) + + # Read scale_factor from tags (if present); otherwise use a 0.0001 heuristic for int16 NDVI/EVI + scale = None + try: + tags = src.tags() + for k in ("scale_factor", "SCALE", "Scale", "scale"): + if k in tags: + scale = float(tags[k]); break + except Exception: + pass + if scale is None and (np.nanmin(arr) >= -10000) and (np.nanmax(arr) <= 10000): + scale = 0.0001 + if scale is not None: + arr = arr * scale + + planes.append(arr) + + if first_geo: + transform = src.transform + h, w = src.height, src.width + lon = np.array([transform * (i, 0) for i in range(w)])[:, 0] + lat = np.array([transform * (0, j) for j in range(h)])[:, 1] + first_geo = False + + data_array = np.stack(planes) # (time, lat, lon) + time_index = np.array(times) + + da = xr.DataArray( + data_array, + dims=["time", "lat", "lon"], + coords={"time": time_index, "lat": lat, "lon": lon}, + name=vname + ) + data_vars[vname] = da + + ds = xr.Dataset(data_vars) + + base = Path(tif_paths[0]).name.split("_")[0] + safe_base = re.sub(r"[^\w\-]", "_", base) + out = Path(output_dir) / f"{safe_base}_nc_output.nc" + ds.to_netcdf(out) + return str(out) + + + + +def parse_time_from_filename(filename): + """ + Example: MOD13A1.061__500m_16_days_NDVI_doy2014145000000_aid0001.tif + Parses date using "doyYYYYDDD", where DDD is the day of year. 
+ """ + match = re.search(r'doy(\d{4})(\d{3})', filename) + if match: + year, doy = int(match.group(1)), int(match.group(2)) + return datetime.strptime(f"{year}{doy}", "%Y%j") + else: + raise ValueError(f"Cannot parse time from filename: {filename}") + +# --- AppEEARS variable-name parser --- # +def parse_appeears_variable_name(tif_path: str) -> str: + """ + Returns the variable/layer name for an AppEEARS GeoTIFF. + Order: + (A) try reading tags (long_name, DESCRIPTION, Layer...) + (B) if not available — parse the filename: + - token before 'doyYYYYDDD' (typical: ..._NDVI_doy2014145_...) + - or one of the known tokens in KNOWN_TOKENS + (C) fallback -> "data" + """ + + + p = Path(tif_path) + name = p.name + + # A) read TIF tags + try: + with rasterio.open(tif_path) as src: + tags = src.tags() + for key in ("long_name", "DESCRIPTION", "Description", "Layer", "LAYER", "BAND_NAME"): + if key in tags and str(tags[key]).strip(): + raw = str(tags[key]).strip() + var = re.sub(r"[^\w\-]+", "_", raw) + return var + except Exception: + pass + + # B1) token before "doyYYYYDDD" + m = re.search(r"_([A-Za-z0-9][A-Za-z0-9_]+)_doy\d{7}", name) + if m: + return m.group(1) + + # B2) known tokens (common AppEEARS layers; list is incomplete but useful) + KNOWN_TOKENS = { + "NDVI", "EVI", + "LST_Day_1km", "LST_Night_1km", "LST_Day_1KM", "LST_Night_1KM", "QC_Day", "QC_Night", + "Lai_500m", "Fpar_500m", "FparLai_QC", + "Nadir_Reflectance_Band1", "Nadir_Reflectance_Band2", "Nadir_Reflectance_Band3", + "Nadir_Reflectance_Band4", "Nadir_Reflectance_Band5", "Nadir_Reflectance_Band6", + "Nadir_Reflectance_Band7", + "SurfReflect_Band1", "SurfReflect_Band2", "SurfReflect_Band3", + "SurfReflect_Band4", "SurfReflect_Band5", "SurfReflect_Band6", "SurfReflect_Band7", + "NDSI_Snow_Cover", + "VIIRS_NDVI", "VIIRS_EVI", + "BurnDate", "BurnDate_Uncertainty", "LAI", "FPAR", "QC" + } + candidates = sorted([t for t in KNOWN_TOKENS if t in name], key=len, reverse=True) + if candidates: + return candidates[0] 
+ + parts = re.split(r"[_.]", name) + parts = [t for t in parts if t and t.lower() != "tif"] + parts = [t for t in parts if not t.lower().startswith("aid")] + parts = [t for t in parts if not re.fullmatch(r"\d{7,8}", t) and not t.startswith("doy")] + if parts: + parts.sort(key=len, reverse=True) + return parts[0] + + return "data" + + +def _ensure_sorted(ds, lat_dim, lon_dim): + if (np.diff(ds[lat_dim].values) < 0).all(): + ds = ds.sortby(lat_dim) + if (np.diff(ds[lon_dim].values) < 0).all(): + ds = ds.sortby(lon_dim) + return ds + +def _nearest_index(arr, x): + # array arr growing: fast via searchsorted + local check + idx = np.searchsorted(arr, x) + if idx == 0: + return 0 + if idx >= len(arr): + return len(arr) - 1 + return idx if abs(arr[idx] - x) < abs(arr[idx-1] - x) else idx-1 + +def _interp1d_time(grid_times_ts, series_vals, t_target): + """Linear 1D interpolation over time (Timestamp => float64). Ignores NaN in the series.""" + # filter NaN in a series + mask = ~np.isnan(series_vals) + if mask.sum() < 2: + return np.nan + x = grid_times_ts[mask].astype("int64") # ns → int64 + y = series_vals[mask].astype(float) + xi = np.int64(pd.Timestamp(t_target).value) + # if out of range — return NaN + if xi < x.min() or xi > x.max(): + return np.nan + return np.interp(xi, x, y) + +def _k_nearest_indices(glat, glon, xlat, xlon, k): + """Returns an array of indices (ilat, ilon) of length k among candidates from the local window""" + # first the shortest path is the nearest grid + i0 = _nearest_index(glat, xlat) + j0 = _nearest_index(glon, xlon) + + # form a small window around (i0, j0) sufficient to find k neighbors + # empirically: radius r = ceil(max(1, sqrt(k))) → (2r+1)^2 >= k + r = int(np.ceil(max(1, np.sqrt(k)))) + i_min, i_max = max(0, i0 - r), min(len(glat) - 1, i0 + r) + j_min, j_max = max(0, j0 - r), min(len(glon) - 1, j0 + r) + + # collect candidates in the window + cand = [] + for ii in range(i_min, i_max + 1): + for jj in range(j_min, j_max + 1): + d = 
np.hypot(glat[ii] - xlat, glon[jj] - xlon) + cand.append((d, ii, jj)) + cand.sort(key=lambda t: t[0]) + top = cand[:k] + return [(ii, jj) for _, ii, jj in top] + +def _idw(values, distances, p=2): + """IDW average for already interpolated values. distances > 0 (add eps).""" + vals = np.array(values, dtype=float) + d = np.array(distances, dtype=float) + 1e-12 + w = 1.0 / (d ** p) + # ignore NaN in vals + mask = ~np.isnan(vals) + if not mask.any(): + return np.nan + w_sel = w[mask] + v_sel = vals[mask] + return np.sum(w_sel * v_sel) / np.sum(w_sel) + +# --- NEW: helper --- +def _detect_time_name(ds): + # 1)quick candidates by name + name_candidates = ("time","valid_time","forecast_time","verification_time","t","Time","datetime","date") + for c in name_candidates: + if c in ds.coords or c in ds.variables: + return c + + # 2) CF attributes: standard_name = "time" or units with the word "since" + for name, var in ds.variables.items(): + stdn = str(var.attrs.get("standard_name","")).lower() + units = str(var.attrs.get("units","")) + if stdn == "time": + return name + if "since" in units: + return name + return None \ No newline at end of file diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index 21deda6..9eff0c3 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -17,7 +17,7 @@ def safe_open_nc_with_time_decoding(path): """ try: - ds = xr.open_dataset(path, decode_times=False) + ds = xr.open_dataset(path, decode_times=False, chunks="auto") time_name = _detect_time_name(ds) if time_name is None: @@ -55,6 +55,30 @@ def safe_open_nc_with_time_decoding(path): except Exception as e: raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") + +#### *** Attempt at optimization +def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str]): + """ + Return union [nc_start, nc_end] across all selected variables. + If time is missing for all → (None, None). 
+ """ + nc_start, nc_end = None, None + for v in (selected_env_vars or []): + nc_path = env_var_map.get(v) + if not nc_path: + continue + ds = safe_open_nc_with_time_decoding(nc_path) + try: + if ("time" in ds.coords) or ("time" in ds.variables): + tmin = pd.to_datetime(ds["time"].values.min()) + tmax = pd.to_datetime(ds["time"].values.max()) + nc_start = tmin if (nc_start is None or tmin < nc_start) else nc_start + nc_end = tmax if (nc_end is None or tmax > nc_end) else nc_end + finally: + ds.close() + return nc_start, nc_end + +#### *** def get_nc_bounds(nc_path: str): """ @@ -138,6 +162,14 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele if df_filtered.empty: print("[WARNING] No points within the boundary.") return + + # ===*** Time prefiltering (union across selected variables) === + nc_start, nc_end = get_nc_timerange_for_selected(env_var_map, selected_env_vars) + df_filtered = filter_points_within_timerange(df_filtered, nc_start, nc_end) + if df_filtered.empty: + print("[WARNING] No points within the NC time window after prefiltering.") + return + # ===*** # === Step 2: Loading and interpolation of environmental data === result = load_selected_environmental_data(df_filtered, env_var_map, @@ -149,6 +181,20 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele df_annotated, nc_start, nc_end = result +#### diagnostic + var = selected_env_vars[0] if selected_env_vars else None + if var in df_annotated.columns: + in_nc = df_annotated["timestamp"].between( + pd.to_datetime(df_annotated["timestamp"]).min() if pd.isna(nc_start) else nc_start, + pd.to_datetime(df_annotated["timestamp"]).max() if pd.isna(nc_end) else nc_end + ) + filled_total = df_annotated[var].notna().sum() + filled_in_nc = df_annotated.loc[in_nc, var].notna().sum() + print(f"[DEBUG] Filled '{var}': total={filled_total}, within-NC-window={filled_in_nc}") + else: + print(f"[WARNING] Column '{var}' not found in annotated DataFrame.") 
+##### + # === Step 3: Time filtering === df_time_filtered = df_annotated.copy() print("[INFO] Full timestamp range preserved. Outside-NC values will be NaN.") @@ -247,9 +293,13 @@ def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=Non # UNUSED OPTION def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_end: pd.Timestamp) -> pd.DataFrame: df = df.copy() - df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce") #? pd.to_datetime + if nc_start is None or nc_end is None: + print("[INFO] NC union time range unavailable. Skipping time prefilter.") + return df + df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce") + before = len(df) filtered_df = df[(df["timestamp"] >= nc_start) & (df["timestamp"] <= nc_end)] - print(f"[INFO] Filtered {len(filtered_df)} / {len(df)} rows within NetCDF time range: {nc_start} — {nc_end}") + print(f"[INFO] Time-prefiltered rows: {len(filtered_df)} / {before} within [{nc_start} .. {nc_end}]") return filtered_df def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: @@ -291,7 +341,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, - "IDW (time-linear)" """ label = (interpolation_method or "").strip().lower() - label = label.replace("neighbor", "neighbour") ###?? 
+ label = label.replace("neighbor", "neighbour") # Normalise US/UK spelling is_nearest = label.startswith("nearest") is_idw = ("idw" in label) or ("inverse distance" in label) @@ -305,7 +355,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): """ - Temporal: linear interpolation (1D per-point) + Temporal: vectorised linear interpolation over time (per grid-cell group) Spatial: nearest neighbour (1 grid node per point) """ from shapely.geometry import Point @@ -314,9 +364,20 @@ def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothin out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) - # coordinate placeholders - nc_latitudes = np.full(len(out), np.nan) - nc_longitudes = np.full(len(out), np.nan) + # placeholders for nearest grid coords + nc_latitudes = np.full(len(out), np.nan, dtype="float64") + nc_longitudes = np.full(len(out), np.nan, dtype="float64") + + # precompute numeric times for vectorised interp + tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") + + # helper: vectorised nearest-index for monotonic arrays + def _nearest_indices_vectorized(arr, vals): + idx = np.searchsorted(arr, vals) + idx = np.clip(idx, 0, len(arr) - 1) + left = np.maximum(idx - 1, 0) + take_left = (idx > 0) & (np.abs(arr[left] - vals) <= np.abs(arr[idx] - vals)) + return np.where(take_left, left, idx) for var in selected_vars: file_path = env_var_map.get(var) @@ -329,30 +390,32 @@ def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothin ds = safe_open_nc_with_time_decoding(file_path) if var not in ds: print(f"[WARNING] Variable {var} not in {file_path}") + ds.close() continue da = ds[var] - # determine basic axes + # detect dims dims = list(da.dims) lat_dim = "lat" if "lat" in dims else "latitude" 
lon_dim = "lon" if "lon" in dims else "longitude" ds = _ensure_sorted(ds, lat_dim, lon_dim) da = ds[var] dims = list(da.dims) + + # unify time dim to "time" time_dim = "time" if "time" in dims else next( - (d for d in ("valid_time","forecast_time","verification_time","t","Time") if d in dims), - None + (d for d in ("valid_time","forecast_time","verification_time","t","Time") if d in dims), None ) if time_dim is None: + ds.close() raise ValueError(f"No time-like dimension in {var}: dims={dims}") if time_dim != "time": ds = ds.rename({time_dim: "time"}) da = ds[var] dims = list(da.dims) - time_dim = "time" - # Cut out unnecessary dimentions: pressure_level, number, expver, etc. - extra = [d for d in dims if d not in (time_dim, lat_dim, lon_dim)] + # slice away extra dims once (pressure level, ensemble, etc.) + extra = [d for d in dims if d not in ("time", lat_dim, lon_dim)] if extra: sel = {} for d in extra: @@ -361,49 +424,70 @@ def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothin coord = ds.coords[d] if d in ds.coords else ds[d] except Exception: coord = None - if dl in ("pressure_level", "isobaricinhpa", "level"): idx = 0 if coord is not None: try: vals = np.asarray(coord.values, dtype=float) - # обрати рівень, найближчий до 1000 гПа (як у вашому файлі) - #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! idx = int(np.nanargmin(np.abs(vals - 1000.0))) except Exception: idx = 0 sel[d] = idx else: - # інші дод. 
виміри → беремо перший елемент sel[d] = 0 - - da = da.isel(**sel).squeeze() # (time, lat, lon) + da = da.isel(**sel).squeeze() # -> (time, lat, lon) glat = ds[lat_dim].values glon = ds[lon_dim].values - gtime = pd.to_datetime(ds["time"].values).values # datetime64[ns] - gtime_ts = pd.to_datetime(gtime) # Timestamp indexable + gtime = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") + + # Vectorised nearest-cell lookup for all point + lat_idx = _nearest_indices_vectorized(glat, out["location_lat"].to_numpy(dtype="float64")) + lon_idx = _nearest_indices_vectorized(glon, out["location_lon"].to_numpy(dtype="float64")) + + # Store nc_lat/nc_lon + nc_latitudes[:] = glat[lat_idx] + nc_longitudes[:] = glon[lon_idx] + + # Group by grid cell; map cell code -> unique index + cell_code = (lat_idx.astype(np.int64) * len(glon)) + lon_idx.astype(np.int64) + unique_cells, inverse = np.unique(cell_code, return_inverse=True) + + # Cache per-cell time series (to avoid re-reading the .nc repeatedly) + series_cache = {} + + # Vectorised interpolation for each group + for g, code in enumerate(unique_cells): + ii = int(code // len(glon)) + jj = int(code % len(glon)) + + # take all positions in this cell + pos = np.nonzero(inverse == g)[0] + xi = tgt_times[pos] + + # we read the time series of this cell only once + key = (ii, jj) + if key not in series_cache: + # .values ​​reads (time,) one series; with dask it's 1 read/calculation + series_cache[key] = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") + y = series_cache[key] + + # mask of valid + m = np.isfinite(y) + if m.sum() < 2: + out.iloc[pos, out.columns.get_loc(var)] = np.nan + continue - # one line at a time — only one series of 1 grid - for idx, row in out.iterrows(): - t = row["timestamp"] - xlat = row["location_lat"] - xlon = row["location_lon"] + x = gtime[m] + yy = y[m] - # out of time range → NaN - if t < gtime_ts.min() or t > gtime_ts.max(): - continue + # np.interp: fast, but does 
not put NaN out of range - will set it ourselves + vals = np.interp(xi, x, yy) + vals[(xi < x.min()) | (xi > x.max())] = np.nan - ii = _nearest_index(glat, xlat) - jj = _nearest_index(glon, xlon) + out.iloc[pos, out.columns.get_loc(var)] = vals - # extract the time series of one grid - # expect dims ("time", lat_dim, lon_dim) - series = da.isel({lat_dim: ii, lon_dim: jj}).values # (time,) - val = _interp1d_time(gtime_ts, series, t) - out.at[idx, var] = val - nc_latitudes[idx] = glat[ii] - nc_longitudes[idx] = glon[jj] + ds.close() except Exception as e: print(f"[ERROR] {var}: {e}") @@ -416,8 +500,11 @@ def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothin def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): """ - Temporal: linear (1D per-point per-neighbour) - Spatial: IDW over k nearest grid nodes (k = smoothing_k, chosen in UI) + Temporal: linear (1D), vectorised in time via np.interp with a cache of per-cell time series. + Spatial: IDW over k nearest grid nodes (k = smoothing_k). + No external libraries. Cache: + - series_cache[(ii, jj)] -> (x_valid_int64, y_valid_float64) for cell (lat_idx, lon_idx). + This removes repeated da.isel(...).values calls for the same neighbouring cells across rows. 
""" from shapely.geometry import Point @@ -428,6 +515,11 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: out["nc_lat"] = out["location_lat"].values out["nc_lon"] = out["location_lon"].values + # Target times as int64 ns (for fast np.interp) + tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") + lat_vals = out["location_lat"].to_numpy(dtype="float64") + lon_vals = out["location_lon"].to_numpy(dtype="float64") + for var in selected_vars: file_path = env_var_map.get(var) out[var] = np.nan @@ -439,69 +531,105 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: ds = safe_open_nc_with_time_decoding(file_path) if var not in ds: print(f"[WARNING] Variable {var} not in {file_path}") + ds.close() continue da = ds[var] - dims = set(da.dims) + dims = list(da.dims) lat_dim = "lat" if "lat" in dims else "latitude" lon_dim = "lon" if "lon" in dims else "longitude" ds = _ensure_sorted(ds, lat_dim, lon_dim) da = ds[var] + dims = list(da.dims) - # — find the name of the time dimension and unify to "time" - dims_list = list(da.dims) - time_dim = "time" if "time" in dims_list else next( - (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims_list), None + + time_dim = "time" if "time" in dims else next( + (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), None ) if time_dim is None: - raise ValueError(f"No time-like dimension in {var}: dims={dims_list}") + ds.close() + raise ValueError(f"No time-like dimension in {var}: dims={dims}") if time_dim != "time": ds = ds.rename({time_dim: "time"}) da = ds[var] - dims_list = list(da.dims) + dims = list(da.dims) - # — remove unnecessary measurements (pressure, ensemble, etc.) 
- extra_dims = [d for d in dims_list if d not in ("time", lat_dim, lon_dim)] + # Remove unnecessary measurements (pressure/ensemble/expver → 0th or closest to 1000 hPa) + extra_dims = [d for d in dims if d not in ("time", lat_dim, lon_dim)] if extra_dims: sel = {} - # special logic for pressure levels: take 1000 hPa if it exists; otherwise the first - level_keys = {"pressure_level", "isobaricInhPa", "level"} for d in extra_dims: - if d in da.coords and d.lower() in {k.lower() for k in level_keys}: - try: - lev = np.asarray(ds[d].values, dtype=float) - sel[d] = int(np.nanargmin(np.abs(lev - 1000.0))) # closest to 1000 hPa - except Exception: - sel[d] = 0 + dl = d.lower() + try: + coord = ds.coords[d] if d in ds.coords else ds[d] + except Exception: + coord = None + if dl in ("pressure_level", "isobaricinhpa", "level"): + idx = 0 + if coord is not None: + try: + vals = np.asarray(coord.values, dtype=float) + idx = int(np.nanargmin(np.abs(vals - 1000.0))) + except Exception: + idx = 0 + sel[d] = idx else: sel[d] = 0 - da = da.isel(**sel).squeeze() + da = da.isel(**sel).squeeze() # -> (time, lat, lon) glat = ds[lat_dim].values glon = ds[lon_dim].values - gtime = pd.to_datetime(ds["time"].values).values - gtime_ts = pd.to_datetime(gtime) + gtime_int = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") - for idx, row in out.iterrows(): - t = row["timestamp"] - xlat = row["location_lat"] - xlon = row["location_lon"] + # *** CACHE of per-cell time series *** + # key: (ii, jj) -> (x_int64_valid, y_float64_valid) + series_cache: dict[tuple[int, int], tuple[np.ndarray, np.ndarray]] = {} + col_idx = out.columns.get_loc(var) - if t < gtime_ts.min() or t > gtime_ts.max(): + # # Main loop over rows (without repeated da.isel reads now) + for idx in range(len(out)): + t_i = tgt_times[idx] + xlat = lat_vals[idx] + xlon = lon_vals[idx] + + # out of time range → NaN + if t_i < gtime_int.min() or t_i > gtime_int.max(): continue - # find k nearest nodes through 
local window + # find k neighbors (window around nearest) nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) + vals = np.empty(k, dtype="float64") + dists = np.empty(k, dtype="float64") + + for j, (ii, jj) in enumerate(nn_idx): + key = (ii, jj) + if key not in series_cache: + # read the cell's time series once + y = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") + m = np.isfinite(y) + if m.sum() >= 2: + x = gtime_int[m] + yy = y[m] + else: + x = np.empty(0, dtype="int64") + yy = np.empty(0, dtype="float64") + series_cache[key] = (x, yy) + + x, yy = series_cache[key] + if x.size < 2: + vals[j] = np.nan + else: + v = np.interp(t_i, x, yy) + if (t_i < x.min()) or (t_i > x.max()): + v = np.nan + vals[j] = v - vals = [] - dists = [] - for ii, jj in nn_idx: - series = da.isel({lat_dim: ii, lon_dim: jj}).values # (time,) - v = _interp1d_time(gtime_ts, series, t) - vals.append(v) - dists.append(np.hypot(glat[ii] - xlat, glon[jj] - xlon)) + # Geodistance (planar Euclidean in degrees; as before) + dists[j] = np.hypot(glat[ii] - xlat, glon[jj] - xlon) - out.at[idx, var] = _idw(vals, dists, p=2) + out.iloc[idx, col_idx] = _idw(vals, dists, p=2) + + ds.close() except Exception as e: print(f"[ERROR] {var}: {e}") @@ -510,9 +638,10 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] return out, pd.NaT, pd.NaT + def convert_tif_to_nc_before_annotation(tif_paths, output_dir): """ - onverts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. + Converts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. For each variable, builds a data(time, lat, lon) array. Returns the path to the generated .nc file. 
""" diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index 16aa0a5..0566b9f 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -89,7 +89,7 @@ class movebank_annotation_engine(param.Parameterized): ) interpolation_method = pn.widgets.Select( name="Interpolation method (spatial)", - options=["Nearest neighbour (time-linear)", "Inverse Distance Weighting (time-linear)"], + options=["Nearest neighbor (time-linear)", "Inverse Distance Weighting (time-linear)"], value="Inverse Distance Weighting (time-linear)" ) make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") @@ -133,7 +133,7 @@ class movebank_annotation_engine(param.Parameterized): tif_interpolation_method = pn.widgets.Select( name="Interpolation method (spatial)", - options=["Nearest neighbour (time-linear)", "Inverse Distance Weighting (time-linear)"], + options=["Nearest neighbor (time-linear)", "Inverse Distance Weighting (time-linear)"], value="Inverse Distance Weighting (time-linear)" ) tif_make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") @@ -936,7 +936,7 @@ def run_annotation_tif(self, *events): # Interpolation and time-fit options (prefer TIF-tab widgets; fallback to NC-tab) interp_widget = getattr(self, "tif_interpolation_method", None) - interp_method = getattr(interp_widget, "value", "Nearest neighbour (time-linear)") + interp_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") # Output CSV path (optional) out_widget = getattr(self, "tif_output_path", None) @@ -1304,7 +1304,7 @@ def _auto_height(self, pane, line_px=22, padding=8): def _update_smoothing_options(self, event): """Updates options for control_smoothing depending on interpolation method (.nc).""" - if event.new.startswith("Nearest neighbour"): + if event.new.startswith("Nearest neighbor"): self.control_smoothing.options = ["1"] 
self.control_smoothing.value = "1" else: @@ -1314,7 +1314,7 @@ def _update_smoothing_options(self, event): def _update_smoothing_options_tif(self, event): """Updates options for control_smoothing depending on interpolation method(.tif).""" - if event.new.startswith("Nearest neighbour"): + if event.new.startswith("Nearest neighbor"): self.tif_control_smoothing.options = ["1"] self.tif_control_smoothing.value = "1" else: From 7baf23d1c7a3f68a99300319f425fc75752852dd Mon Sep 17 00:00:00 2001 From: olekshche Date: Mon, 1 Sep 2025 17:08:53 +0300 Subject: [PATCH 03/17] feat: support vertical levels in NetCDF vars (UI + annotation) --- ecodata/__init__.py | 3 +- ecodata/a_e_f.py | 762 ---------------------- ecodata/annotation_eng_func.py | 375 ++++++----- ecodata/app/apps/annotation_engine_app.py | 63 +- ecodata/movebank_functions.py | 69 +- 5 files changed, 327 insertions(+), 945 deletions(-) delete mode 100644 ecodata/a_e_f.py diff --git a/ecodata/__init__.py b/ecodata/__init__.py index 3f54aa5..dbb2c1b 100644 --- a/ecodata/__init__.py +++ b/ecodata/__init__.py @@ -50,7 +50,8 @@ process_csv_interp_or_averaging, # noqa validate_and_process_csv, merge_csv_files_from_folder, - generate_individual_csvs_for_local_ids + generate_individual_csvs_for_local_ids, + delete_files ) from ecodata.annotation_eng_func import( load_vector_extent_info, diff --git a/ecodata/a_e_f.py b/ecodata/a_e_f.py deleted file mode 100644 index 07408ac..0000000 --- a/ecodata/a_e_f.py +++ /dev/null @@ -1,762 +0,0 @@ -import xarray as xr -import geopandas as gpd -from pathlib import Path -import pandas as pd -import re -from shapely.geometry import Point, box -import numpy as np -from datetime import datetime -import rasterio - -def safe_open_nc_with_time_decoding(path): - """ - Opens a NetCDF file with support for non-standard calendars: - julian, gregorian, 360_day, noleap, etc. - Always returns the 'time' coordinate as a pd.DatetimeIndex, - even if it was originally of cftime type. 
- """ - - try: - ds = xr.open_dataset(path, decode_times=False) - - time_name = _detect_time_name(ds) - if time_name is None: - raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').") - - # if time is in variables but not in coords — make it a coordinate - if time_name in ds.variables and time_name not in ds.coords: - ds = ds.set_coords(time_name) - - time_var = ds[time_name] - units = str(time_var.attrs.get("units","")) - calendar = str(time_var.attrs.get("calendar","standard")).lower() - - if "since" not in units: - # sometimes there are "epoch seconds" without 'since' - # add default: seconds since 1970-01-01 - if units.strip() == "" and pd.api.types.is_integer_dtype(time_var.dtype): - units = "seconds since 1970-01-01" - calendar = "proleptic_gregorian" - - decoded = xr.coding.times.decode_cf_datetime(time_var.values, units, calendar) - # if these are cftime objects — convert via str - if hasattr(decoded[0], "strftime"): - decoded = pd.to_datetime([str(d) for d in decoded]) - else: - decoded = pd.to_datetime(decoded) - - # rename the time coordinate to the unified 'time' - if time_name != "time": - ds = ds.assign_coords({time_name: decoded}).rename({time_name: "time"}) - else: - ds = ds.assign_coords(time=decoded) - - return ds - - except Exception as e: - raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") - -def get_nc_bounds(nc_path: str): - """ - Returns a dictionary of boundaries from .nc in CRS WGS84: {"S": ..., "N": ..., "W": ..., "E": ...} - """ - ds = safe_open_nc_with_time_decoding(nc_path) - # candidate coordinate names - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x") - - lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) - if lat_name is None or lon_name is None: - raise ValueError("Could not detect lat/lon coordinate 
names in NetCDF") - - lat_min = float(ds[lat_name].min()) - lat_max = float(ds[lat_name].max()) - lon_min = float(ds[lon_name].min()) - lon_max = float(ds[lon_name].max()) - ds.close() - return {"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max} - -def load_vector_extent_info(path): - try: - ext = Path(path).suffix.lower() - if ext not in [".shp", ".geojson"]: - raise ValueError("Unsupported file format. Please select a .shp or .geojson file.") - - gdf = gpd.read_file(path) - bounds = gdf.total_bounds # [minx, miny, maxx, maxy] - west, south, east, north = bounds - return path, south, north, west, east - except Exception as e: - raise RuntimeError(f"Failed to load vector file: {e}") - -def load_taxa_and_ids_from_csv(file_path): - """ - Reads a Movebank-style CSV and returns: - - DataFrame - - List of unique taxon names - - List of unique individual IDs - """ - try: - df = pd.read_csv(file_path) - columns = {re.sub(r"[-._\s]+", "_", col.lower()): col for col in df.columns} - id_key = "individual_local_identifier" - taxon_key = "individual_taxon_canonical_name" - id_col = columns.get(id_key) - taxon_col = columns.get(taxon_key) - if id_col is None: - return None, [], [], "No column found for individual-local-identifier" - - unique_ids = sorted(df[id_col].dropna().astype(str).unique()) - unique_taxa = sorted(df[taxon_col].dropna().astype(str).unique()) if taxon_col else [] - - return df, unique_taxa, unique_ids, None - - except Exception as e: - return None, [], [], str(e) - -def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids, - boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2, - out_csv_path=None): - """ - env_var_map: dict[str, str] — variable → file path - selected_env_vars: list[str] — selected variables - movebank_path: str — path to the Movebank CSV - selected_ids: list[str] — IDs for annotation - boundary_path: str — path to .shp or .geojson - """ - print("[DEBUG] Annotation started") - 
print("Selected variables:", selected_env_vars) - print("From files:", [env_var_map[v] for v in selected_env_vars]) - print("Selected IDs:", selected_ids) - print("Movebank file:", movebank_path) - print("Boundary file:", boundary_path) - print("Interpolation method:", interpolation_method) - - # === Step 1: Spatial filtering === - df_filtered, _ = filter_points_within_boundary(movebank_path, selected_ids, boundary_path, bbox=bbox) - if df_filtered.empty: - print("[WARNING] No points within the boundary.") - return - - # === Step 2: Loading and interpolation of environmental data === - result = load_selected_environmental_data(df_filtered, env_var_map, - selected_env_vars, movebank_path, - interpolation_method, smoothing_k=smoothing_k) - if result is None: - print("[ERROR] Environmental data was not loaded.") - return - - df_annotated, nc_start, nc_end = result - -#### diagnistic - var = selected_env_vars[0] if selected_env_vars else None - if var in df_annotated.columns: - in_nc = df_annotated["timestamp"].between( - pd.to_datetime(df_annotated["timestamp"]).min() if pd.isna(nc_start) else nc_start, - pd.to_datetime(df_annotated["timestamp"]).max() if pd.isna(nc_end) else nc_end - ) - filled_total = df_annotated[var].notna().sum() - filled_in_nc = df_annotated.loc[in_nc, var].notna().sum() - print(f"[DEBUG] Filled '{var}': total={filled_total}, within-NC-window={filled_in_nc}") - else: - print(f"[WARNING] Column '{var}' not found in annotated DataFrame.") -##### - - # === Step 3: Time filtering === - df_time_filtered = df_annotated.copy() - print("[INFO] Full timestamp range preserved. 
Outside-NC values will be NaN.") - - # === Step 4: Saving the final result === - if out_csv_path: - out_path = Path(out_csv_path) - else: - out_path = Path(movebank_path).parent / "annotated_env.csv" - df_time_filtered = df_time_filtered.drop(columns=["geometry", "nc_lat", "nc_lon"], errors="ignore") - df_time_filtered.to_csv(out_path, index=False, encoding="utf-8-sig", date_format="%Y-%m-%d %H:%M:%S") - print(f"[INFO] Final filtered annotation saved to {out_path}") - - # === Step 5: Saving by individual ID === - output_folder = out_path.parent / "annotated_individuals" - output_folder.mkdir(parents=True, exist_ok=True) - - id_col = "individual_local_identifier" - if id_col in df_time_filtered.columns: - unique_ids = df_time_filtered[id_col].dropna().unique() - for uid in unique_ids: - df_id = df_time_filtered[df_time_filtered[id_col] == uid] - safe_uid = re.sub(r"[^\w\-]", "_", str(uid)) - out_file = output_folder / f"annotated_env_{safe_uid}.csv" - df_id.to_csv(out_file, index=False) - print(f"[INFO] Saved {len(unique_ids)} individual files to {output_folder}") - else: - print("[WARNING] Column 'individual_local_identifier' not found. Skipping per-ID export.") - - -def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=None, bbox=None): - print("[DEBUG] Filtering is started") - df = pd.read_csv(movebank_path) - df.columns = [re.sub(r"[-:.\s]+", "_", col.lower()) for col in df.columns] - if "location_long" in df.columns and "location_lon" not in df.columns: - df["location_lon"] = df["location_long"] - if "timestamp" not in df.columns and "eobs_start_timestamp" in df.columns: - df["timestamp"] = df["eobs_start_timestamp"] - - required_cols = {"location_lat", "location_lon", "individual_local_identifier", "timestamp"} - if not required_cols.issubset(df.columns): - raise ValueError(f"Required columns are missing in Movebank file. 
Missing: {required_cols - set(df.columns)}") - - # ID-filter - df = df[df["individual_local_identifier"].isin(selected_ids)] - df = interpolate_missing_coordinates(df) - - output_path = Path(movebank_path).parent / "trimmed.csv" - if bbox is not None: - S, N, W, E = map(float, (bbox["S"], bbox["N"], bbox["W"], bbox["E"])) - m = df["location_lat"].between(S, N) & df["location_lon"].between(W, E) - df = df.loc[m].copy() - df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] - gdf_filtered = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") - - try: - if gdf_filtered.empty: - print("[INFO] No points within bbox. File not saved.") - else: - gdf_filtered.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) - print(f"[INFO] (bbox) Data saved to {output_path}") - except Exception as e: - print(f"[ERROR] Failed to save (bbox) data: {e}") - return gdf_filtered, output_path - - # case: boundary from shp/geojson - df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] - gdf_points = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") - - if boundary_path is None: - print("[INFO] No boundary provided. Skipping spatial clipping (all selected IDs kept).") - try: - gdf_points.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) - print(f"[INFO] (No-boundary) Data saved to {output_path}") - except Exception as e: - print(f"[ERROR] Failed to save (no-boundary) data: {e}") - return gdf_points, output_path - - gdf_boundary = gpd.read_file(boundary_path) - if gdf_boundary.crs != gdf_points.crs: - gdf_boundary = gdf_boundary.to_crs(gdf_points.crs) - - gdf_filtered = gpd.sjoin(gdf_points, gdf_boundary[["geometry"]], predicate="within", how="inner").drop(columns="index_right") - - try: - if gdf_filtered.empty: - print("[INFO] No points within boundary. 
File not saved.") - else: - gdf_filtered.drop(columns=["geometry"], errors="ignore").to_csv(output_path, index=False) - print(f"[INFO] Filtered data saved to {output_path}") - except Exception as e: - print(f"[ERROR] Failed to save filtered data: {e}") - - return gdf_filtered, output_path - -# UNUSED OPTION -def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_end: pd.Timestamp) -> pd.DataFrame: - df = df.copy() - df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce") #? pd.to_datetime - filtered_df = df[(df["timestamp"] >= nc_start) & (df["timestamp"] <= nc_end)] - print(f"[INFO] Filtered {len(filtered_df)} / {len(df)} rows within NetCDF time range: {nc_start} — {nc_end}") - return filtered_df - -def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: - """ - Interpolates missing values in 'location_lat' and 'location_lon' columns - based on the 'timestamp'. Removes rows with invalid timestamps. - """ - required_cols = {"timestamp", "location_lat", "location_lon"} - if not required_cols.issubset(df.columns): - raise ValueError(f"DataFrame must contain columns: {required_cols}") - - df = df.copy() - df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True, errors="coerce") - - n_missing = df["timestamp"].isna().sum() - if n_missing > 0: - print(f"[INFO] {n_missing} rows with missing or invalid timestamps were removed before interpolation.") - - df = df.dropna(subset=["timestamp"]) # Remove Na before creating the index - df = df.sort_values("timestamp") - df.set_index("timestamp", inplace=True) - - for coord in ["location_lat", "location_lon"]: - df[coord] = pd.to_numeric(df[coord], errors="coerce") - - df[["location_lat", "location_lon"]] = df[["location_lat", "location_lon"]].interpolate( - method="time", limit_direction="both" - ) - - df = df.reset_index() - return df - -def load_selected_environmental_data(df, env_var_map, selected_vars, - movebank_path, interpolation_method="Nearest 
neighbour", smoothing_k: int = 2): - """ - Wrapper that calls the appropriate annotation function depending on the interpolation method. - Supports: - - "Nearest neighbour (time-linear)" - - "IDW (time-linear)" - """ - label = (interpolation_method or "").strip().lower() - label = label.replace("neighbor", "neighbour") ###?? - - is_nearest = label.startswith("nearest") - is_idw = ("idw" in label) or ("inverse distance" in label) - - if is_nearest: - return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) - elif is_idw: - return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) - else: - raise ValueError(f"Unknown interpolation method: {interpolation_method}") - -def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): - """ - Temporal: linear interpolation (1D per-point) - Spatial: nearest neighbour (1 grid node per point) - """ - from shapely.geometry import Point - - out = df.copy() - out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") - out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) - - # coordinate placeholders - nc_latitudes = np.full(len(out), np.nan) - nc_longitudes = np.full(len(out), np.nan) - - for var in selected_vars: - file_path = env_var_map.get(var) - out[var] = np.nan - if not file_path or not Path(file_path).is_file(): - print(f"[WARNING] File for {var} not found: {file_path}") - continue - - try: - ds = safe_open_nc_with_time_decoding(file_path) - if var not in ds: - print(f"[WARNING] Variable {var} not in {file_path}") - continue - - da = ds[var] - # determine basic axes - dims = list(da.dims) - lat_dim = "lat" if "lat" in dims else "latitude" - lon_dim = "lon" if "lon" in dims else "longitude" - ds = _ensure_sorted(ds, lat_dim, lon_dim) - da = ds[var] - dims = list(da.dims) - time_dim = "time" if "time" in dims else next( - (d for d in 
("valid_time","forecast_time","verification_time","t","Time") if d in dims), - None - ) - if time_dim is None: - raise ValueError(f"No time-like dimension in {var}: dims={dims}") - if time_dim != "time": - ds = ds.rename({time_dim: "time"}) - da = ds[var] - dims = list(da.dims) - time_dim = "time" - - # Cut out unnecessary dimentions: pressure_level, number, expver, etc. - extra = [d for d in dims if d not in (time_dim, lat_dim, lon_dim)] - if extra: - sel = {} - for d in extra: - dl = d.lower() - try: - coord = ds.coords[d] if d in ds.coords else ds[d] - except Exception: - coord = None - - if dl in ("pressure_level", "isobaricinhpa", "level"): - idx = 0 - if coord is not None: - try: - vals = np.asarray(coord.values, dtype=float) - # обрати рівень, найближчий до 1000 гПа (як у вашому файлі) - #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - idx = int(np.nanargmin(np.abs(vals - 1000.0))) - except Exception: - idx = 0 - sel[d] = idx - else: - # інші дод. виміри → беремо перший елемент - sel[d] = 0 - - da = da.isel(**sel).squeeze() # (time, lat, lon) - - glat = ds[lat_dim].values - glon = ds[lon_dim].values - gtime = pd.to_datetime(ds["time"].values).values # datetime64[ns] - gtime_ts = pd.to_datetime(gtime) # Timestamp indexable - - # one line at a time — only one series of 1 grid - for idx, row in out.iterrows(): - t = row["timestamp"] - xlat = row["location_lat"] - xlon = row["location_lon"] - - # out of time range → NaN - if t < gtime_ts.min() or t > gtime_ts.max(): - continue - - ii = _nearest_index(glat, xlat) - jj = _nearest_index(glon, xlon) - - # extract the time series of one grid - # expect dims ("time", lat_dim, lon_dim) - series = da.isel({lat_dim: ii, lon_dim: jj}).values # (time,) - val = _interp1d_time(gtime_ts, series, t) - out.at[idx, var] = val - nc_latitudes[idx] = glat[ii] - nc_longitudes[idx] = glon[jj] - - except Exception as e: - print(f"[ERROR] {var}: {e}") - continue - - out["nc_lat"] = nc_latitudes - out["nc_lon"] = 
nc_longitudes - out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] - return out, pd.NaT, pd.NaT - -def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): - """ - Temporal: linear (1D per-point per-neighbour) - Spatial: IDW over k nearest grid nodes (k = smoothing_k, chosen in UI) - """ - from shapely.geometry import Point - - k = max(2, int(smoothing_k)) - out = df.copy() - out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") - out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) - out["nc_lat"] = out["location_lat"].values - out["nc_lon"] = out["location_lon"].values - - for var in selected_vars: - file_path = env_var_map.get(var) - out[var] = np.nan - if not file_path or not Path(file_path).is_file(): - print(f"[WARNING] File for {var} not found: {file_path}") - continue - - try: - ds = safe_open_nc_with_time_decoding(file_path) - if var not in ds: - print(f"[WARNING] Variable {var} not in {file_path}") - continue - - da = ds[var] - dims = set(da.dims) - lat_dim = "lat" if "lat" in dims else "latitude" - lon_dim = "lon" if "lon" in dims else "longitude" - ds = _ensure_sorted(ds, lat_dim, lon_dim) - da = ds[var] - - # — find the name of the time dimension and unify to "time" - dims_list = list(da.dims) - time_dim = "time" if "time" in dims_list else next( - (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims_list), None - ) - if time_dim is None: - raise ValueError(f"No time-like dimension in {var}: dims={dims_list}") - if time_dim != "time": - ds = ds.rename({time_dim: "time"}) - da = ds[var] - dims_list = list(da.dims) - - # — remove unnecessary measurements (pressure, ensemble, etc.) 
- extra_dims = [d for d in dims_list if d not in ("time", lat_dim, lon_dim)] - if extra_dims: - sel = {} - # special logic for pressure levels: take 1000 hPa if it exists; otherwise the first - level_keys = {"pressure_level", "isobaricInhPa", "level"} - for d in extra_dims: - if d in da.coords and d.lower() in {k.lower() for k in level_keys}: - try: - lev = np.asarray(ds[d].values, dtype=float) - sel[d] = int(np.nanargmin(np.abs(lev - 1000.0))) # closest to 1000 hPa - except Exception: - sel[d] = 0 - else: - sel[d] = 0 - da = da.isel(**sel).squeeze() - - glat = ds[lat_dim].values - glon = ds[lon_dim].values - gtime = pd.to_datetime(ds["time"].values).values - gtime_ts = pd.to_datetime(gtime) - - for idx, row in out.iterrows(): - t = row["timestamp"] - xlat = row["location_lat"] - xlon = row["location_lon"] - - if t < gtime_ts.min() or t > gtime_ts.max(): - continue - - # find k nearest nodes through local window - nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) - - vals = [] - dists = [] - for ii, jj in nn_idx: - series = da.isel({lat_dim: ii, lon_dim: jj}).values # (time,) - v = _interp1d_time(gtime_ts, series, t) - vals.append(v) - dists.append(np.hypot(glat[ii] - xlat, glon[jj] - xlon)) - - out.at[idx, var] = _idw(vals, dists, p=2) - - except Exception as e: - print(f"[ERROR] {var}: {e}") - continue - - out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] - return out, pd.NaT, pd.NaT - -def convert_tif_to_nc_before_annotation(tif_paths, output_dir): - """ - onverts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. - For each variable, builds a data(time, lat, lon) array. - Returns the path to the generated .nc file. 
- """ - tif_paths = [str(Path(p)) for p in tif_paths] - if not tif_paths: - raise ValueError("No .tif files provided") - - # 1) Group files by variable - by_var = {} - for tif in tif_paths: - vname = parse_appeears_variable_name(tif) - by_var.setdefault(vname, []).append(tif) - - lat = lon = None - data_vars = {} - - for vname, files in by_var.items(): - times = [] - planes = [] - first_geo = True - - for tif in sorted(files): - tif_name = Path(tif).name - t = parse_time_from_filename(tif_name) - times.append(t) - - with rasterio.open(tif) as src: - arr = src.read(1).astype("float32") - nodata = src.nodata - if nodata is not None: - arr = np.where(arr == nodata, np.nan, arr) - - # Read scale_factor from tags (if present); otherwise use a 0.0001 heuristic for int16 NDVI/EVI - scale = None - try: - tags = src.tags() - for k in ("scale_factor", "SCALE", "Scale", "scale"): - if k in tags: - scale = float(tags[k]); break - except Exception: - pass - if scale is None and (np.nanmin(arr) >= -10000) and (np.nanmax(arr) <= 10000): - scale = 0.0001 - if scale is not None: - arr = arr * scale - - planes.append(arr) - - if first_geo: - transform = src.transform - h, w = src.height, src.width - lon = np.array([transform * (i, 0) for i in range(w)])[:, 0] - lat = np.array([transform * (0, j) for j in range(h)])[:, 1] - first_geo = False - - data_array = np.stack(planes) # (time, lat, lon) - time_index = np.array(times) - - da = xr.DataArray( - data_array, - dims=["time", "lat", "lon"], - coords={"time": time_index, "lat": lat, "lon": lon}, - name=vname - ) - data_vars[vname] = da - - ds = xr.Dataset(data_vars) - - base = Path(tif_paths[0]).name.split("_")[0] - safe_base = re.sub(r"[^\w\-]", "_", base) - out = Path(output_dir) / f"{safe_base}_nc_output.nc" - ds.to_netcdf(out) - return str(out) - - - - -def parse_time_from_filename(filename): - """ - Example: MOD13A1.061__500m_16_days_NDVI_doy2014145000000_aid0001.tif - Parses date using "doyYYYYDDD", where DDD is the day of year. 
- """ - match = re.search(r'doy(\d{4})(\d{3})', filename) - if match: - year, doy = int(match.group(1)), int(match.group(2)) - return datetime.strptime(f"{year}{doy}", "%Y%j") - else: - raise ValueError(f"Cannot parse time from filename: {filename}") - -# --- AppEEARS variable-name parser --- # -def parse_appeears_variable_name(tif_path: str) -> str: - """ - Returns the variable/layer name for an AppEEARS GeoTIFF. - Order: - (A) try reading tags (long_name, DESCRIPTION, Layer...) - (B) if not available — parse the filename: - - token before 'doyYYYYDDD' (typical: ..._NDVI_doy2014145_...) - - or one of the known tokens in KNOWN_TOKENS - (C) fallback -> "data" - """ - - - p = Path(tif_path) - name = p.name - - # A) read TIF tags - try: - with rasterio.open(tif_path) as src: - tags = src.tags() - for key in ("long_name", "DESCRIPTION", "Description", "Layer", "LAYER", "BAND_NAME"): - if key in tags and str(tags[key]).strip(): - raw = str(tags[key]).strip() - var = re.sub(r"[^\w\-]+", "_", raw) - return var - except Exception: - pass - - # B1) token before "doyYYYYDDD" - m = re.search(r"_([A-Za-z0-9][A-Za-z0-9_]+)_doy\d{7}", name) - if m: - return m.group(1) - - # B2) known tokens (common AppEEARS layers; list is incomplete but useful) - KNOWN_TOKENS = { - "NDVI", "EVI", - "LST_Day_1km", "LST_Night_1km", "LST_Day_1KM", "LST_Night_1KM", "QC_Day", "QC_Night", - "Lai_500m", "Fpar_500m", "FparLai_QC", - "Nadir_Reflectance_Band1", "Nadir_Reflectance_Band2", "Nadir_Reflectance_Band3", - "Nadir_Reflectance_Band4", "Nadir_Reflectance_Band5", "Nadir_Reflectance_Band6", - "Nadir_Reflectance_Band7", - "SurfReflect_Band1", "SurfReflect_Band2", "SurfReflect_Band3", - "SurfReflect_Band4", "SurfReflect_Band5", "SurfReflect_Band6", "SurfReflect_Band7", - "NDSI_Snow_Cover", - "VIIRS_NDVI", "VIIRS_EVI", - "BurnDate", "BurnDate_Uncertainty", "LAI", "FPAR", "QC" - } - candidates = sorted([t for t in KNOWN_TOKENS if t in name], key=len, reverse=True) - if candidates: - return candidates[0] 
- - parts = re.split(r"[_.]", name) - parts = [t for t in parts if t and t.lower() != "tif"] - parts = [t for t in parts if not t.lower().startswith("aid")] - parts = [t for t in parts if not re.fullmatch(r"\d{7,8}", t) and not t.startswith("doy")] - if parts: - parts.sort(key=len, reverse=True) - return parts[0] - - return "data" - - -def _ensure_sorted(ds, lat_dim, lon_dim): - if (np.diff(ds[lat_dim].values) < 0).all(): - ds = ds.sortby(lat_dim) - if (np.diff(ds[lon_dim].values) < 0).all(): - ds = ds.sortby(lon_dim) - return ds - -def _nearest_index(arr, x): - # array arr growing: fast via searchsorted + local check - idx = np.searchsorted(arr, x) - if idx == 0: - return 0 - if idx >= len(arr): - return len(arr) - 1 - return idx if abs(arr[idx] - x) < abs(arr[idx-1] - x) else idx-1 - -def _interp1d_time(grid_times_ts, series_vals, t_target): - """Linear 1D interpolation over time (Timestamp => float64). Ignores NaN in the series.""" - # filter NaN in a series - mask = ~np.isnan(series_vals) - if mask.sum() < 2: - return np.nan - x = grid_times_ts[mask].astype("int64") # ns → int64 - y = series_vals[mask].astype(float) - xi = np.int64(pd.Timestamp(t_target).value) - # if out of range — return NaN - if xi < x.min() or xi > x.max(): - return np.nan - return np.interp(xi, x, y) - -def _k_nearest_indices(glat, glon, xlat, xlon, k): - """Returns an array of indices (ilat, ilon) of length k among candidates from the local window""" - # first the shortest path is the nearest grid - i0 = _nearest_index(glat, xlat) - j0 = _nearest_index(glon, xlon) - - # form a small window around (i0, j0) sufficient to find k neighbors - # empirically: radius r = ceil(max(1, sqrt(k))) → (2r+1)^2 >= k - r = int(np.ceil(max(1, np.sqrt(k)))) - i_min, i_max = max(0, i0 - r), min(len(glat) - 1, i0 + r) - j_min, j_max = max(0, j0 - r), min(len(glon) - 1, j0 + r) - - # collect candidates in the window - cand = [] - for ii in range(i_min, i_max + 1): - for jj in range(j_min, j_max + 1): - d = 
np.hypot(glat[ii] - xlat, glon[jj] - xlon) - cand.append((d, ii, jj)) - cand.sort(key=lambda t: t[0]) - top = cand[:k] - return [(ii, jj) for _, ii, jj in top] - -def _idw(values, distances, p=2): - """IDW average for already interpolated values. distances > 0 (add eps).""" - vals = np.array(values, dtype=float) - d = np.array(distances, dtype=float) + 1e-12 - w = 1.0 / (d ** p) - # ignore NaN in vals - mask = ~np.isnan(vals) - if not mask.any(): - return np.nan - w_sel = w[mask] - v_sel = vals[mask] - return np.sum(w_sel * v_sel) / np.sum(w_sel) - -# --- NEW: helper --- -def _detect_time_name(ds): - # 1)quick candidates by name - name_candidates = ("time","valid_time","forecast_time","verification_time","t","Time","datetime","date") - for c in name_candidates: - if c in ds.coords or c in ds.variables: - return c - - # 2) CF attributes: standard_name = "time" or units with the word "since" - for name, var in ds.variables.items(): - stdn = str(var.attrs.get("standard_name","")).lower() - units = str(var.attrs.get("units","")) - if stdn == "time": - return name - if "since" in units: - return name - return None \ No newline at end of file diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index 9eff0c3..9c2ac99 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -3,11 +3,14 @@ from pathlib import Path import pandas as pd import re -from shapely.geometry import Point, box +from shapely.geometry import Point import numpy as np from datetime import datetime import rasterio +LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") + + def safe_open_nc_with_time_decoding(path): """ Opens a NetCDF file with support for non-standard calendars: @@ -56,7 +59,7 @@ def safe_open_nc_with_time_decoding(path): except Exception as e: raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") -#### *** Attempt at optimization + def 
get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str]): """ Return union [nc_start, nc_end] across all selected variables. @@ -78,7 +81,6 @@ def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str ds.close() return nc_start, nc_end -#### *** def get_nc_bounds(nc_path: str): """ @@ -86,20 +88,23 @@ def get_nc_bounds(nc_path: str): """ ds = safe_open_nc_with_time_decoding(nc_path) # candidate coordinate names - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x") + try: + lat_candidates = ("lat", "latitude", "y") + lon_candidates = ("lon", "longitude", "x","long") - lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) - if lat_name is None or lon_name is None: - raise ValueError("Could not detect lat/lon coordinate names in NetCDF") + lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) + lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) + if lat_name is None or lon_name is None: + raise ValueError("Could not detect lat/lon coordinate names in NetCDF") + + lat_min = float(ds[lat_name].min()) + lat_max = float(ds[lat_name].max()) + lon_min = float(ds[lon_name].min()) + lon_max = float(ds[lon_name].max()) + return {"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max} + finally: + ds.close() - lat_min = float(ds[lat_name].min()) - lat_max = float(ds[lat_name].max()) - lon_min = float(ds[lon_name].min()) - lon_max = float(ds[lon_name].max()) - ds.close() - return {"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max} def load_vector_extent_info(path): try: @@ -114,6 +119,7 @@ def load_vector_extent_info(path): except Exception as e: raise RuntimeError(f"Failed to load vector file: {e}") + def load_taxa_and_ids_from_csv(file_path): """ Reads a Movebank-style CSV and returns: @@ 
-139,6 +145,7 @@ def load_taxa_and_ids_from_csv(file_path): except Exception as e: return None, [], [], str(e) + def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2, out_csv_path=None): @@ -151,7 +158,7 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele """ print("[DEBUG] Annotation started") print("Selected variables:", selected_env_vars) - print("From files:", [env_var_map[v] for v in selected_env_vars]) + print("From files:", [env_var_map.get(v) for v in selected_env_vars]) print("Selected IDs:", selected_ids) print("Movebank file:", movebank_path) print("Boundary file:", boundary_path) @@ -290,7 +297,7 @@ def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=Non return gdf_filtered, output_path -# UNUSED OPTION + def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_end: pd.Timestamp) -> pd.DataFrame: df = df.copy() if nc_start is None or nc_end is None: @@ -302,6 +309,7 @@ def filter_points_within_timerange(df: pd.DataFrame, nc_start: pd.Timestamp, nc_ print(f"[INFO] Time-prefiltered rows: {len(filtered_df)} / {before} within [{nc_start} .. 
{nc_end}]") return filtered_df + def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: """ Interpolates missing values in 'location_lat' and 'location_lon' columns @@ -332,6 +340,7 @@ def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: df = df.reset_index() return df + def load_selected_environmental_data(df, env_var_map, selected_vars, movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2): """ @@ -353,258 +362,308 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, else: raise ValueError(f"Unknown interpolation method: {interpolation_method}") -def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): + +def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4): """ - Temporal: vectorised linear interpolation over time (per grid-cell group) - Spatial: nearest neighbour (1 grid node per point) + Annotate movement points with environmental values using: + - Spatial: nearest grid node + - Temporal: vectorised linear interpolation in time (per grid cell) + + This version supports "expanded" variable labels that include a pressure/vertical level, + e.g. "v_1000", "v_975", ... For such labels, the base variable ("v") is taken from the + NetCDF, and the closest level to the requested value (e.g. 1000 hPa) is selected along + the appropriate vertical dimension (e.g. isobaricInhPa/level/lev/plev/...). + + Parameters + ---------- + df : pandas.DataFrame + Movebank-like table with columns: timestamp, location_lat, location_lon, etc. + env_var_map : dict[str, str] + Mapping from UI label to NetCDF path, e.g. {"v_1000": "/path/file.nc"}. + selected_vars : list[str] + Labels picked in the UI; labels may be plain vars ("t2m") or var+level ("v_850"). + movebank_path : str + Used only for output file placement upstream in the pipeline. + smoothing_k : int + Unused in the nearest-neighbour branch (kept for signature symmetry). 
+ + Returns + ------- + (out_df, nc_start, nc_end) + `out_df` includes new columns for each selected label; nc_* are placeholders here. + + Notes + ----- + - Assumes `safe_open_nc_with_time_decoding` and `_ensure_sorted` are available in scope. + - Column names in the result exactly match `selected_vars` (e.g. "v_1000"). """ - from shapely.geometry import Point + def _nearest_indices_vectorized(arr, vals): + """ + Fast nearest-index for a (monotonic) 1D array `arr` + against multiple query values `vals` (vectorised). + """ + idx = np.searchsorted(arr, vals) + idx = np.clip(idx, 0, len(arr) - 1) + left = np.maximum(idx - 1, 0) + take_left = (idx > 0) & (np.abs(arr[left] - vals) <= np.abs(arr[idx] - vals)) + return np.where(take_left, left, idx) + # --- input prep ----------------------------------------------------------- out = df.copy() out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) - # placeholders for nearest grid coords + # Placeholders for nearest grid coords (one set; overwritten by last variable) nc_latitudes = np.full(len(out), np.nan, dtype="float64") nc_longitudes = np.full(len(out), np.nan, dtype="float64") - # precompute numeric times for vectorised interp + # Target times for np.interp (int64 ns) tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") - # helper: vectorised nearest-index for monotonic arrays - def _nearest_indices_vectorized(arr, vals): - idx = np.searchsorted(arr, vals) - idx = np.clip(idx, 0, len(arr) - 1) - left = np.maximum(idx - 1, 0) - take_left = (idx > 0) & (np.abs(arr[left] - vals) <= np.abs(arr[idx] - vals)) - return np.where(take_left, left, idx) + # --- main loop over requested labels ------------------------------------- + for label in selected_vars: + file_path = env_var_map.get(label) + out[label] = np.nan # ensure column exists even on failures - for var in selected_vars: - file_path = 
env_var_map.get(var) - out[var] = np.nan if not file_path or not Path(file_path).is_file(): - print(f"[WARNING] File for {var} not found: {file_path}") + print(f"[WARNING] File for {label} not found: {file_path}") continue + # Split the UI label into (base_var, requested_level) + base_var, target_level = _split_var_and_level(label) + try: ds = safe_open_nc_with_time_decoding(file_path) - if var not in ds: - print(f"[WARNING] Variable {var} not in {file_path}") + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") ds.close() continue - da = ds[var] - # detect dims + da = ds[base_var] dims = list(da.dims) + + # Detect lat/lon names; keep dataset sorted in both lat_dim = "lat" if "lat" in dims else "latitude" lon_dim = "lon" if "lon" in dims else "longitude" ds = _ensure_sorted(ds, lat_dim, lon_dim) - da = ds[var] + da = ds[base_var] dims = list(da.dims) - # unify time dim to "time" + # Unify/ensure time dimension is named 'time' time_dim = "time" if "time" in dims else next( - (d for d in ("valid_time","forecast_time","verification_time","t","Time") if d in dims), None + (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), + None ) if time_dim is None: ds.close() - raise ValueError(f"No time-like dimension in {var}: dims={dims}") + raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") if time_dim != "time": ds = ds.rename({time_dim: "time"}) - da = ds[var] + da = ds[base_var] dims = list(da.dims) - # slice away extra dims once (pressure level, ensemble, etc.) + # Resolve extra dimensions (pressure level, ensemble, expver, etc.) + # For the "level" dim: pick closest to `target_level` (or 1000 hPa by default). 
extra = [d for d in dims if d not in ("time", lat_dim, lon_dim)] if extra: sel = {} for d in extra: - dl = d.lower() - try: - coord = ds.coords[d] if d in ds.coords else ds[d] - except Exception: - coord = None - if dl in ("pressure_level", "isobaricinhpa", "level"): - idx = 0 - if coord is not None: - try: - vals = np.asarray(coord.values, dtype=float) - idx = int(np.nanargmin(np.abs(vals - 1000.0))) - except Exception: - idx = 0 - sel[d] = idx + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) else: - sel[d] = 0 - da = da.isel(**sel).squeeze() # -> (time, lat, lon) + sel[d] = 0 # deterministic default for non-level extra dims + da = da.isel(**sel).squeeze() # now expected shape: (time, lat, lon) + # Grid coordinate vectors glat = ds[lat_dim].values glon = ds[lon_dim].values gtime = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") - # Vectorised nearest-cell lookup for all point + # Vectorised nearest grid-node indices for all points lat_idx = _nearest_indices_vectorized(glat, out["location_lat"].to_numpy(dtype="float64")) lon_idx = _nearest_indices_vectorized(glon, out["location_lon"].to_numpy(dtype="float64")) - # Store nc_lat/nc_lon + # Store the matched grid coordinates (useful for QA) nc_latitudes[:] = glat[lat_idx] nc_longitudes[:] = glon[lon_idx] - # Group by grid cell; map cell code -> unique index + # Group points by grid cell (to read each per-cell time series only once) cell_code = (lat_idx.astype(np.int64) * len(glon)) + lon_idx.astype(np.int64) unique_cells, inverse = np.unique(cell_code, return_inverse=True) - # Cache per-cell time series (to avoid re-reading the .nc repeatedly) - series_cache = {} + # Cache of per-cell series: (ii, jj) -> 1D float64 array over time + series_cache: dict[tuple[int, int], np.ndarray] = {} + col_idx = out.columns.get_loc(label) - # Vectorised interpolation for each group for g, code in enumerate(unique_cells): ii = int(code // len(glon)) jj = int(code % 
len(glon)) - # take all positions in this cell - pos = np.nonzero(inverse == g)[0] - xi = tgt_times[pos] + pos = np.nonzero(inverse == g)[0] # row indices in `out` for this cell + xi = tgt_times[pos] # target times (int64 ns) - # we read the time series of this cell only once key = (ii, jj) if key not in series_cache: - # .values ​​reads (time,) one series; with dask it's 1 read/calculation + # Read the cell time series once; cast to float64 for np.interp series_cache[key] = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") y = series_cache[key] - # mask of valid + # Valid-only mask for temporal interpolation m = np.isfinite(y) if m.sum() < 2: - out.iloc[pos, out.columns.get_loc(var)] = np.nan + out.iloc[pos, col_idx] = np.nan continue - x = gtime[m] - yy = y[m] + x = gtime[m] # source times (int64) + yy = y[m] # source values - # np.interp: fast, but does not put NaN out of range - will set it ourselves vals = np.interp(xi, x, yy) + # Outside native time range → NaN (np.interp would extend) vals[(xi < x.min()) | (xi > x.max())] = np.nan - out.iloc[pos, out.columns.get_loc(var)] = vals + out.iloc[pos, col_idx] = vals ds.close() except Exception as e: - print(f"[ERROR] {var}: {e}") + print(f"[ERROR] {label}: {e}") continue + # Final QA columns out["nc_lat"] = nc_latitudes out["nc_lon"] = nc_longitudes out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] + + # Harmonise return signature with the rest of your pipeline return out, pd.NaT, pd.NaT + def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): """ - Temporal: linear (1D), vectorised in time via np.interp with a cache of per-cell time series. - Spatial: IDW over k nearest grid nodes (k = smoothing_k). - No external libraries. Cache: - - series_cache[(ii, jj)] -> (x_valid_int64, y_valid_float64) for cell (lat_idx, lon_idx). - This removes repeated da.isel(...).values calls for the same neighbouring cells across rows. 
+ Annotate movement points with environmental values using: + - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes + - Temporal: 1D linear interpolation in time (per grid node), vectorised via np.interp + + This version understands expanded variable labels that include a pressure/vertical level, + e.g. "v_1000", "v_975". It will: + 1) parse the UI label into (base_var, target_level), + 2) find a known vertical dimension (isobaricInhPa/level/lev/plev/...), + 3) slice the DataArray to the closest level to `target_level` (or 1000 hPa by default). + + Parameters + ---------- + df : pandas.DataFrame + Movebank-like table with columns: timestamp, location_lat, location_lon, etc. + env_var_map : dict[str, str] + Mapping from UI label to NetCDF path, e.g. {"v_1000": "/path/file.nc"}. + selected_vars : list[str] + Labels picked in the UI; each label becomes a column in the output. + movebank_path : str + Kept for signature symmetry with the rest of the pipeline (output path handled upstream). + smoothing_k : int + Number of nearest grid nodes for IDW (>=2). + + Returns + ------- + (out_df, nc_start, nc_end) + `out_df` contains new columns with the same names as `selected_vars`. + `nc_start`, `nc_end` are placeholders here (NaT). 
""" - from shapely.geometry import Point - + # --- input prep ---------------------------------------------------------------- k = max(2, int(smoothing_k)) out = df.copy() out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) + + # Keep nc_lat/nc_lon semantics consistent with prior implementation (copy of point coords) out["nc_lat"] = out["location_lat"].values out["nc_lon"] = out["location_lon"].values - # Target times as int64 ns (for fast np.interp) + # Vectorised numeric targets for temporal interpolation tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") lat_vals = out["location_lat"].to_numpy(dtype="float64") lon_vals = out["location_lon"].to_numpy(dtype="float64") - for var in selected_vars: - file_path = env_var_map.get(var) - out[var] = np.nan + # --- main loop over labels ----------------------------------------------------- + for label in selected_vars: + file_path = env_var_map.get(label) + out[label] = np.nan # ensure the column exists even if we skip/err + if not file_path or not Path(file_path).is_file(): - print(f"[WARNING] File for {var} not found: {file_path}") + print(f"[WARNING] File for {label} not found: {file_path}") continue + # Split label into base variable and optional requested level + base_var, target_level = _split_var_and_level(label) + try: ds = safe_open_nc_with_time_decoding(file_path) - if var not in ds: - print(f"[WARNING] Variable {var} not in {file_path}") + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not in {file_path}") ds.close() continue - da = ds[var] + da = ds[base_var] dims = list(da.dims) + + # Detect coordinate names and sort dataset (required by nearest/k-nearest search) lat_dim = "lat" if "lat" in dims else "latitude" lon_dim = "lon" if "lon" in dims else "longitude" ds = _ensure_sorted(ds, lat_dim, lon_dim) - da = ds[var] + da = ds[base_var] dims = list(da.dims) - + # 
Unify time dimension name to 'time' time_dim = "time" if "time" in dims else next( (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), None ) if time_dim is None: ds.close() - raise ValueError(f"No time-like dimension in {var}: dims={dims}") + raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") if time_dim != "time": ds = ds.rename({time_dim: "time"}) - da = ds[var] + da = ds[base_var] dims = list(da.dims) - # Remove unnecessary measurements (pressure/ensemble/expver → 0th or closest to 1000 hPa) + # Resolve extra dimensions (pressure level, ensemble, expver, etc.) extra_dims = [d for d in dims if d not in ("time", lat_dim, lon_dim)] if extra_dims: sel = {} for d in extra_dims: - dl = d.lower() - try: - coord = ds.coords[d] if d in ds.coords else ds[d] - except Exception: - coord = None - if dl in ("pressure_level", "isobaricinhpa", "level"): - idx = 0 - if coord is not None: - try: - vals = np.asarray(coord.values, dtype=float) - idx = int(np.nanargmin(np.abs(vals - 1000.0))) - except Exception: - idx = 0 - sel[d] = idx + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) else: - sel[d] = 0 + sel[d] = 0 # deterministic default for non-level dims da = da.isel(**sel).squeeze() # -> (time, lat, lon) + # Coordinate vectors glat = ds[lat_dim].values glon = ds[lon_dim].values gtime_int = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") - # *** CACHE of per-cell time series *** + # Cache per-grid-node time series (to avoid repeated reads for neighbors) # key: (ii, jj) -> (x_int64_valid, y_float64_valid) series_cache: dict[tuple[int, int], tuple[np.ndarray, np.ndarray]] = {} - col_idx = out.columns.get_loc(var) + col_idx = out.columns.get_loc(label) - # # Main loop over rows (without repeated da.isel reads now) - for idx in range(len(out)): - t_i = tgt_times[idx] - xlat = lat_vals[idx] - xlon = lon_vals[idx] + # Row-wise IDW over k nearest grid nodes + 
for i in range(len(out)): + t_i = tgt_times[i] + xlat = lat_vals[i] + xlon = lon_vals[i] - # out of time range → NaN + # If outside the native time span → keep NaN if t_i < gtime_int.min() or t_i > gtime_int.max(): continue - # find k neighbors (window around nearest) - nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) + nn_idx = _k_nearest_indices(glat, glon, xlat, xlon, k) # provided elsewhere vals = np.empty(k, dtype="float64") dists = np.empty(k, dtype="float64") for j, (ii, jj) in enumerate(nn_idx): key = (ii, jj) if key not in series_cache: - # read the cell's time series once + # Read cell time series once; keep only valid points for interp y = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") m = np.isfinite(y) if m.sum() >= 2: @@ -620,21 +679,23 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: vals[j] = np.nan else: v = np.interp(t_i, x, yy) + # clamp to NaN if extrapolated if (t_i < x.min()) or (t_i > x.max()): v = np.nan vals[j] = v - # Geodistance (planar Euclidean in degrees; as before) + # Planar Euclidean distance in degrees (consistent with prior code) dists[j] = np.hypot(glat[ii] - xlat, glon[jj] - xlon) - out.iloc[idx, col_idx] = _idw(vals, dists, p=2) + out.iloc[i, col_idx] = _idw(vals, dists, p=2) # provided elsewhere ds.close() except Exception as e: - print(f"[ERROR] {var}: {e}") + print(f"[ERROR] {label}: {e}") continue + # Geometry for QA/exports out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] return out, pd.NaT, pd.NaT @@ -709,7 +770,6 @@ def convert_tif_to_nc_before_annotation(tif_paths, output_dir): data_vars[vname] = da ds = xr.Dataset(data_vars) - base = Path(tif_paths[0]).name.split("_")[0] safe_base = re.sub(r"[^\w\-]", "_", base) out = Path(output_dir) / f"{safe_base}_nc_output.nc" @@ -717,8 +777,6 @@ def convert_tif_to_nc_before_annotation(tif_paths, output_dir): return str(out) - - def parse_time_from_filename(filename): """ Example: 
MOD13A1.061__500m_16_days_NDVI_doy2014145000000_aid0001.tif @@ -730,7 +788,8 @@ def parse_time_from_filename(filename): return datetime.strptime(f"{year}{doy}", "%Y%j") else: raise ValueError(f"Cannot parse time from filename: {filename}") - + + # --- AppEEARS variable-name parser --- # def parse_appeears_variable_name(tif_path: str) -> str: """ @@ -742,8 +801,6 @@ def parse_appeears_variable_name(tif_path: str) -> str: - or one of the known tokens in KNOWN_TOKENS (C) fallback -> "data" """ - - p = Path(tif_path) name = p.name @@ -800,6 +857,7 @@ def _ensure_sorted(ds, lat_dim, lon_dim): ds = ds.sortby(lon_dim) return ds + def _nearest_index(arr, x): # array arr growing: fast via searchsorted + local check idx = np.searchsorted(arr, x) @@ -809,19 +867,6 @@ def _nearest_index(arr, x): return len(arr) - 1 return idx if abs(arr[idx] - x) < abs(arr[idx-1] - x) else idx-1 -def _interp1d_time(grid_times_ts, series_vals, t_target): - """Linear 1D interpolation over time (Timestamp => float64). Ignores NaN in the series.""" - # filter NaN in a series - mask = ~np.isnan(series_vals) - if mask.sum() < 2: - return np.nan - x = grid_times_ts[mask].astype("int64") # ns → int64 - y = series_vals[mask].astype(float) - xi = np.int64(pd.Timestamp(t_target).value) - # if out of range — return NaN - if xi < x.min() or xi > x.max(): - return np.nan - return np.interp(xi, x, y) def _k_nearest_indices(glat, glon, xlat, xlon, k): """Returns an array of indices (ilat, ilon) of length k among candidates from the local window""" @@ -845,6 +890,7 @@ def _k_nearest_indices(glat, glon, xlat, xlon, k): top = cand[:k] return [(ii, jj) for _, ii, jj in top] + def _idw(values, distances, p=2): """IDW average for already interpolated values. 
distances > 0 (add eps).""" vals = np.array(values, dtype=float) @@ -858,7 +904,7 @@ def _idw(values, distances, p=2): v_sel = vals[mask] return np.sum(w_sel * v_sel) / np.sum(w_sel) -# --- NEW: helper --- + def _detect_time_name(ds): # 1)quick candidates by name name_candidates = ("time","valid_time","forecast_time","verification_time","t","Time","datetime","date") @@ -874,4 +920,37 @@ def _detect_time_name(ds): return name if "since" in units: return name - return None \ No newline at end of file + return None + + +def _split_var_and_level(label: str): + """ + If the name is in the format _, returns ('var', target_level_float). + Otherwise ('label', None). + """ + m = re.match(r"^([A-Za-z_]\w*)_(\d{2,4})$", str(label)) + if m: + base = m.group(1) + try: + lvl = float(m.group(2)) + except Exception: + lvl = None + return base, lvl + return label, None + + +def _pick_level_index(ds, level_dim: str, target_level: float | None): + """ + Returns the level index: + - if target_level is given, the closest to it; + - otherwise, the closest to 1000 hPa; + - if error, 0. 
+ """ + try: + vals = np.asarray(ds[level_dim].values, dtype=float) + if vals.size == 0: + return 0 + ref = 1000.0 if target_level is None else float(target_level) + return int(np.nanargmin(np.abs(vals - ref))) + except Exception: + return 0 \ No newline at end of file diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index 0566b9f..eb1ff29 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -10,7 +10,7 @@ from datetime import datetime import re from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv -from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only +from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only, delete_files from ecodata.annotation_eng_func import start_annotation_process,convert_tif_to_nc_before_annotation, get_nc_bounds, safe_open_nc_with_time_decoding logger = logging.getLogger(__file__) @@ -327,7 +327,6 @@ def __init__(self, **params): self.tif_interpolation_method.param.watch(self._update_smoothing_options_tif, 'value') - @try_catch("Error loading Individual IDs") def load_ids_from_file(self, *events): self.status_text = "Loading IDs..." 
@@ -392,6 +391,7 @@ def update_annotation_ids_by_taxon(self, event): self.id_multiselect.options = ids self.id_multiselect.value = ids + @try_catch("Error generating CSV") def run_make_csv(self, *events): try: @@ -458,6 +458,7 @@ def _set_time_slider_from_df(self, df: pd.DataFrame): self.time_selection_ID.end = tmax self.time_selection_ID.value = (tmin, tmax) + @try_catch("Error merging files from folder") def run_merge_files(self, *events): try: @@ -475,6 +476,7 @@ def run_merge_files(self, *events): self.alert.object = self.status_text + @try_catch("Error loading environmental data") def load_env_data(self, *events): """We select exactly one .nc, update File/Time/Spatial and the list of 3D variables.""" @@ -513,7 +515,7 @@ def load_env_data(self, *events): # Auxiliary coordinate name candidates time_candidates = ("time","Time","datetime","date","valid_time","forecast_time","verification_time") lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x") + lon_candidates = ("lon", "longitude", "x", "long") try: ds = safe_open_nc_with_time_decoding(nc_path) @@ -535,11 +537,40 @@ def load_env_data(self, *events): lon_max = float(ds[lon_name].max()) spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" - # List of 3D variables (have at least 3 dimensions) - + # ---- Перелік змінних з підтримкою вертикальних рівнів ---- + LEVEL_DIM_CANDIDATES = ("isobaricInhPa", "isobaric_in_hPa", "level", "lev", "plev", "pressure", "pressure_level") + for var in ds.data_vars: da = ds[var] - if da.ndim >= 3: + if da.ndim < 3: + continue # нам потрібні щонайменше time/lat/lon + + dims = list(da.dims) + + # шукаємо назву координати рівня серед типових для ERA5/ECMWF + level_dim = next((d for d in LEVEL_DIM_CANDIDATES if d in dims), None) + + if level_dim is None: + # звичайна 3D-змінна без рівнів — як і раніше var_file_map[var] = nc_path + continue + + # якщо є рівні — додаємо по опції на кожен рівень: var_1000, var_975, ... 
+ try: + level_vals = ds[level_dim].values + except Exception: + level_vals = [] + + for lv in level_vals: + try: + # за замовчуванням показуємо цілими hPa (1000, 975, 950 …) + lv_int = int(round(float(lv))) + label = f"{var}_{lv_int}" + var_file_map[label] = nc_path + except Exception: + # якщо рівень нечисловий — пропускаємо конкретне значення + continue + finally: ds.close() except Exception as e: @@ -566,7 +597,8 @@ def load_env_data(self, *events): self.status_text = f"Loaded {len(var_file_map)} variable(s) from 1 file." self.alert.object = self.status_text self._sync_nc_column_heights() - + + @try_catch("Error loading boundary data") def load_boundary_data(self, *events): self.status_text = "Loading boundary data..." @@ -605,6 +637,7 @@ def load_boundary_data(self, *events): self.alert.object = self.status_text self._sync_nc_column_heights() + @try_catch("Error loading movement data") def load_movement_data(self, *events): self.status_text = "Loading movement data..." @@ -724,6 +757,7 @@ def run_annotation(self, *events): self.alert.object = self.status_text + ####TIF @try_catch("Error loading TIF environmental data") def load_env_data_tif(self, *events): @@ -871,6 +905,7 @@ def load_env_data_tif(self, *events): ) self.alert.object = self.status_text + @try_catch("Error running TIF annotation") def run_annotation_tif(self, *events): """ @@ -902,7 +937,7 @@ def run_annotation_tif(self, *events): self.status_text = "Starting annotation (TIF)…" self.alert.object = self.status_text - # --- 0) Validate inputs --------------------------------------------------- + # --- 0) Validate inputs --- # Movebank CSV (required) movebank_path = getattr(self.tif_movement_data_selector, "value", None) if not movebank_path or not Path(str(movebank_path)).is_file(): @@ -983,7 +1018,6 @@ def run_annotation_tif(self, *events): self.alert.object = self.status_text return - # --- 4) Which variables to annotate? 
-------------------------------------- ms_widget = getattr(self, "tif_env_data_multiselect", None) selected_vars = list(getattr(ms_widget, "value", [])) if ms_widget else [] @@ -1078,6 +1112,7 @@ def load_boundary_data_tif(self, *events): self.status_text = f"Failed to read vector file: {e}" self.alert.object = self.status_text + @try_catch("Error loading TIF movement data") def load_movement_data_tif(self, *events): self.status_text = "Loading TIF movement data..." @@ -1131,6 +1166,7 @@ def load_movement_data_tif(self, *events): self.tif_movement_info.object = "
".join(lines) self.alert.object = self.status_text + @try_catch("Interpolation (missing only) failed") def run_interpolate_missing_only(self, *events): # 1) input @@ -1184,6 +1220,7 @@ def run_interpolate_missing_only(self, *events): self.status_text = "Interpolation complete. No files created (no eligible gaps ≤ 1 day)." self.alert.object = self.status_text + def update_annotation_ids_by_taxon_tif(self, event): if self.df is None: return @@ -1227,6 +1264,7 @@ def update_movement_info_text(self, section, new_values): updated_lines.append(line) self.movement_info.object = "
".join(updated_lines) + def update_env_info_text_tif(self, selected_vars): current = self.tif_env_info.object or "" if not current: @@ -1244,6 +1282,7 @@ def update_env_info_text_tif(self, selected_vars): updated.insert(1, f"Environment parameters: {', '.join(selected_vars) if selected_vars else '-'}") self.tif_env_info.object = "
".join(updated) + def update_movement_info_text_tif(self, section, new_values): current = self.tif_movement_info.object or "" if not current: @@ -1298,10 +1337,12 @@ def _section(self, title, *items, height=None): height=height, ) + def _auto_height(self, pane, line_px=22, padding=8): lines = [l for l in (pane.object or "").split("
") if l.strip()] pane.height = line_px * max(1, len(lines)) + padding + def _update_smoothing_options(self, event): """Updates options for control_smoothing depending on interpolation method (.nc).""" if event.new.startswith("Nearest neighbor"): @@ -1312,6 +1353,7 @@ def _update_smoothing_options(self, event): if self.control_smoothing.value == "1": self.control_smoothing.value = "4" + def _update_smoothing_options_tif(self, event): """Updates options for control_smoothing depending on interpolation method(.tif).""" if event.new.startswith("Nearest neighbor"): @@ -1322,6 +1364,7 @@ def _update_smoothing_options_tif(self, event): if self.tif_control_smoothing.value == "1": self.tif_control_smoothing.value = "4" + def _sync_nc_column_heights(self): """Adjusts the height of the 2nd and 3rd columns to the 1st.""" first = getattr(self, "_nc_col1", None) @@ -1335,6 +1378,7 @@ def _sync_nc_column_heights(self): else: self._apply_nc_height_from_first() + def _apply_nc_height_from_first(self): first = self._nc_col1 if not first: @@ -1345,6 +1389,7 @@ def _apply_nc_height_from_first(self): self._nc_col2.height = h self._nc_col3.height = h + def reset_boundary_data(self, *events): """ Resets boundary to default: no file selected, range = environment boundary (.nc). @@ -1364,9 +1409,9 @@ def reset_boundary_data(self, *events): self.status_text = "Boundary reset to default (auto from .nc)." 
self.alert.object = self.status_text - self._sync_nc_column_heights() + @register_view() def view(): viewer = movebank_annotation_engine() diff --git a/ecodata/movebank_functions.py b/ecodata/movebank_functions.py index 1d4a4fb..cb22f36 100644 --- a/ecodata/movebank_functions.py +++ b/ecodata/movebank_functions.py @@ -22,11 +22,6 @@ TIME_COLUMN = 'timestamp' # Set to "eobs:start-timestamp" or "timestamp" as needed -# --- Utilities --- -from datetime import datetime -import pandas as pd -import re - def parse_timestamp(s: str) -> datetime: """ Robust timestamp parser: @@ -561,22 +556,28 @@ def split_into_sessions(data, max_gap_minutes): "eobs_start_timestamp", "eobs_temperature", "ground_speed", "height_above_ellipsoid" ] - try: - df_check = normalize_column_names(pd.read_csv(session_output_path, low_memory=False)) - numeric_cols_to_fix = [col for col in cols_to_check_for_nan if col in df_check.columns and df_check[col].dtype in ["float64", "int64"] and df_check[col].isna().any()] - - if numeric_cols_to_fix: - df_check["timestamp"] = pd.to_datetime(df_check["timestamp"], errors="coerce") - df_check = df_check.set_index("timestamp") - df_check[numeric_cols_to_fix] = df_check[numeric_cols_to_fix].interpolate(method="time", limit_direction="both") - df_check = df_check.reset_index() - df_check.to_csv(session_output_path, index=False) - print(f"Interpolated missing values in: {numeric_cols_to_fix} for file {session_output_path}") - except Exception as e: - print(f"Interpolation post-check failed for {session_output_path}: {e}") - - result_paths.append(session_output_path) - print(f"Subset {session_output_path} has been created.") + if result_paths: # перевірка, що є створені файли + last_file = result_paths[-1] + try: + df_check = normalize_column_names(pd.read_csv(last_file, low_memory=False)) + numeric_cols_to_fix = [ + col for col in cols_to_check_for_nan + if col in df_check.columns + and df_check[col].dtype in ["float64", "int64"] + and df_check[col].isna().any() + 
] + + if numeric_cols_to_fix: + df_check["timestamp"] = pd.to_datetime(df_check["timestamp"], errors="coerce") + df_check = df_check.set_index("timestamp") + df_check[numeric_cols_to_fix] = df_check[numeric_cols_to_fix].interpolate( + method="time", limit_direction="both" + ) + df_check = df_check.reset_index() + df_check.to_csv(last_file, index=False) + print(f"Interpolated missing values in: {numeric_cols_to_fix} for file {last_file}") + except Exception as e: + print(f"Interpolation post-check failed for {last_file}: {e}") return result_paths @@ -590,9 +591,8 @@ def merge_csv_files_from_folder(folder_path: Path, delete_empty_columns: bool) - delete_empty_columns (bool): If True, remove non-overlapping columns. Returns: - tuple: (merged DataFrame, list of removed column names) + tuple: (merged DataFrame, list of removed column names, list of source CSV file paths) """ - csv_files = sorted(folder_path.glob("*.csv")) if not csv_files: raise ValueError("No CSV files found in the selected folder.") @@ -606,7 +606,7 @@ def merge_csv_files_from_folder(folder_path: Path, delete_empty_columns: bool) - merged_df = pd.concat(cleaned_dataframes, ignore_index=True) else: merged_df = pd.concat(dataframes, ignore_index=True) - return merged_df, sorted(missing_columns) + return merged_df, sorted(missing_columns), [str(p) for p in csv_files] # --- Filename --- def safe_filename(name: str, replacement: str = "_") -> str: @@ -973,4 +973,23 @@ def resolve_id_key(fieldnames) -> str | None: nk = _norm_key(cand) if nk in norm_to_orig: return norm_to_orig[nk] - return None \ No newline at end of file + return None + +def delete_files(paths: list[str], keep: list[str] | None = None) -> list[str]: + """ + Delete files by absolute/relative paths. + Returns a list of successfully deleted paths. 
+ """ + from pathlib import Path + keep_set = {str(Path(k).resolve()) for k in (keep or [])} + deleted = [] + for p in paths: + try: + rp = str(Path(p).resolve()) + if rp in keep_set: + continue + Path(rp).unlink(missing_ok=True) + deleted.append(rp) + except Exception as e: + print(f"[delete_files] Failed to delete {p}: {e}") + return deleted \ No newline at end of file From e671115b76ffe7c248243667376450020705d036 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Thu, 22 Jan 2026 00:16:40 -0700 Subject: [PATCH 04/17] annotation app: updated selection of env coordinate names - add auto-detection for time, lat, long, x, y variable names - add GUI options for geographic (lat/long) or projected(x/y) - update functions to take dict of env variable name mapping --- ecodata/annotation_eng_func.py | 262 +++++++++++++++++----- ecodata/app/apps/annotation_engine_app.py | 254 ++++++++++++++++----- 2 files changed, 405 insertions(+), 111 deletions(-) diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index 9c2ac99..455dc58 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -11,21 +11,88 @@ LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") -def safe_open_nc_with_time_decoding(path): +def open_nc_metadata(path: str) -> xr.Dataset: + """ + Open a NetCDF dataset for metadata inspection only. + + Notes + ----- + - This function is intended for UI/metadata purposes (listing variables/coords/dims). + - It does not decode time and should avoid heavy computation. + + Parameters + ---------- + path : str + Path to a NetCDF file. + + Returns + ------- + xarray.Dataset + Opened dataset (time not decoded). 
+ """ + # decode_times=False prevents CF time decoding and avoids cftime edge cases during UI inspection + return xr.open_dataset(path, decode_times=False, chunks="auto") + + +def detect_env_coord_names(ds: xr.Dataset) -> dict: + """ + Detect coordinate names for an environmental dataset. + + Parameters + ---------- + ds : xarray.Dataset + Environmental dataset. + + Returns + ------- + dict + Dictionary with keys: 'env_time', 'env_x', 'env_y', 'env_lat', 'env_lon'. + Values may be None if not detected. + """ + + # time + env_time = _detect_time_name(ds) + + # projected axes + x_candidates = ["x", "X", "projection_x_coordinate", "eastings", "easting"] + y_candidates = ["y", "Y", "projection_y_coordinate", "northings", "northing"] + + env_x = next((c for c in x_candidates if c in ds.coords and c in ds.dims), None) + env_y = next((c for c in y_candidates if c in ds.coords and c in ds.dims), None) + + # geographic coords (can be 1D or 2D) + lat_candidates = ["lat", "latitude", "Latitude"] + lon_candidates = ["lon", "longitude", "long", "Longitude"] + + env_lat = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) + env_lon = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) + + return { + "env_time": env_time, + "env_x": env_x, + "env_y": env_y, + "env_lat": env_lat, + "env_lon": env_lon, + } + + +def safe_open_nc_with_time_decoding(path, time_name: str | None = None): """ Opens a NetCDF file with support for non-standard calendars: julian, gregorian, 360_day, noleap, etc. - Always returns the 'time' coordinate as a pd.DatetimeIndex, + Always returns the 'time' coordinate as a pd.DatetimeIndex, even if it was originally of cftime type. 
""" try: ds = xr.open_dataset(path, decode_times=False, chunks="auto") - time_name = _detect_time_name(ds) + if time_name is None: + time_name = _detect_time_name(ds) if time_name is None: raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').") + # if time is in variables but not in coords — make it a coordinate if time_name in ds.variables and time_name not in ds.coords: ds = ds.set_coords(time_name) @@ -58,19 +125,20 @@ def safe_open_nc_with_time_decoding(path): except Exception as e: raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") - -def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str]): + +def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str], time_name: str | None = None): """ Return union [nc_start, nc_end] across all selected variables. If time is missing for all → (None, None). """ + nc_start, nc_end = None, None for v in (selected_env_vars or []): nc_path = env_var_map.get(v) if not nc_path: continue - ds = safe_open_nc_with_time_decoding(nc_path) + ds = safe_open_nc_with_time_decoding(nc_path, time_name=time_name) try: if ("time" in ds.coords) or ("time" in ds.variables): tmin = pd.to_datetime(ds["time"].values.min()) @@ -82,26 +150,25 @@ def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str return nc_start, nc_end -def get_nc_bounds(nc_path: str): +def get_nc_bounds(nc_path: str, env_coord_names: dict | None = None): """ Returns a dictionary of boundaries from .nc in CRS WGS84: {"S": ..., "N": ..., "W": ..., "E": ...} """ + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") + lat_name = env_coord_names.get("env_lat") + lon_name = env_coord_names.get("env_lon") + ds = safe_open_nc_with_time_decoding(nc_path) - # candidate coordinate names try: - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x","long") - - lat_name = next((c for c in 
lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) if lat_name is None or lon_name is None: - raise ValueError("Could not detect lat/lon coordinate names in NetCDF") + raise ValueError("Could not determine lat/lon bounds.") lat_min = float(ds[lat_name].min()) lat_max = float(ds[lat_name].max()) lon_min = float(ds[lon_name].min()) lon_max = float(ds[lon_name].max()) - return {"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max} + return {"S": lat_min, "N": lat_max, "W": lon_min, "E": lon_max} finally: ds.close() @@ -118,7 +185,7 @@ def load_vector_extent_info(path): return path, south, north, west, east except Exception as e: raise RuntimeError(f"Failed to load vector file: {e}") - + def load_taxa_and_ids_from_csv(file_path): """ @@ -144,17 +211,18 @@ def load_taxa_and_ids_from_csv(file_path): except Exception as e: return None, [], [], str(e) - + def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2, - out_csv_path=None): + out_csv_path=None, env_coord_names: dict | None = None): """ env_var_map: dict[str, str] — variable → file path selected_env_vars: list[str] — selected variables movebank_path: str — path to the Movebank CSV selected_ids: list[str] — IDs for annotation boundary_path: str — path to .shp or .geojson + env_coord_names: dict — mapping of coordinate names for env datasets """ print("[DEBUG] Annotation started") print("Selected variables:", selected_env_vars) @@ -169,19 +237,20 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele if df_filtered.empty: print("[WARNING] No points within the boundary.") return - + # ===*** Time prefiltering (union across selected variables) === - nc_start, nc_end = get_nc_timerange_for_selected(env_var_map, selected_env_vars) + time_var = env_coord_names.get("env_time") if 
env_coord_names else None + nc_start, nc_end = get_nc_timerange_for_selected(env_var_map, selected_env_vars, time_name=time_var) df_filtered = filter_points_within_timerange(df_filtered, nc_start, nc_end) if df_filtered.empty: print("[WARNING] No points within the NC time window after prefiltering.") return - # ===*** + # ===*** # === Step 2: Loading and interpolation of environmental data === result = load_selected_environmental_data(df_filtered, env_var_map, selected_env_vars, movebank_path, - interpolation_method, smoothing_k=smoothing_k) + interpolation_method, smoothing_k=smoothing_k, env_coord_names=env_coord_names,) if result is None: print("[ERROR] Environmental data was not loaded.") return @@ -266,7 +335,7 @@ def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=Non except Exception as e: print(f"[ERROR] Failed to save (bbox) data: {e}") return gdf_filtered, output_path - + # case: boundary from shp/geojson df["geometry"] = [Point(lon, lat) for lon, lat in zip(df["location_lon"], df["location_lat"])] gdf_points = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326") @@ -342,7 +411,8 @@ def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: def load_selected_environmental_data(df, env_var_map, selected_vars, - movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2): + movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2, + env_coord_names: dict | None = None): """ Wrapper that calls the appropriate annotation function depending on the interpolation method. 
Supports: @@ -356,14 +426,14 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, is_idw = ("idw" in label) or ("inverse distance" in label) if is_nearest: - return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) + return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, env_coord_names=env_coord_names) elif is_idw: - return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) + return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, env_coord_names=env_coord_names) else: raise ValueError(f"Unknown interpolation method: {interpolation_method}") - -def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4): + +def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4, env_coord_names: dict | None = None): """ Annotate movement points with environmental values using: - Spatial: nearest grid node @@ -386,6 +456,9 @@ def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothin Used only for output file placement upstream in the pipeline. smoothing_k : int Unused in the nearest-neighbour branch (kept for signature symmetry). + env_coord_names : dict | None + Mapping of coordinate names for env datasets. Expected keys: + 'env_time', 'env_x', 'env_y', 'env_lat', 'env_lon'. 
Returns ------- @@ -433,7 +506,14 @@ def _nearest_indices_vectorized(arr, vals): base_var, target_level = _split_var_and_level(label) try: - ds = safe_open_nc_with_time_decoding(file_path) + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") + lat_name = env_coord_names.get("env_lat") + lon_name = env_coord_names.get("env_lon") + x_name = env_coord_names.get("env_x") + y_name = env_coord_names.get("env_y") + + ds = safe_open_nc_with_time_decoding(file_path, time_name=time_name) if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") ds.close() @@ -442,25 +522,47 @@ def _nearest_indices_vectorized(arr, vals): da = ds[base_var] dims = list(da.dims) - # Detect lat/lon names; keep dataset sorted in both - lat_dim = "lat" if "lat" in dims else "latitude" - lon_dim = "lon" if "lon" in dims else "longitude" + # coordinate/dimension selection + lat_dim = lat_name if (lat_name in dims) else None + lon_dim = lon_name if (lon_name in dims) else None + + if lat_dim is None or lon_dim is None: + # Optional strict fallback to x/y if user provided them AND they are dims + x_dim = x_name if (x_name in dims) else None + y_dim = y_name if (y_name in dims) else None + + if x_dim is not None and y_dim is not None: + lat_dim = y_dim + lon_dim = x_dim + else: + ds.close() + raise ValueError( + "Could not resolve spatial dimensions from the provided env_coord_names.\n" + f" Requested lat dim: {lat_name!r} (is_dim={lat_name in dims if lat_name else False})\n" + f" Requested lon dim: {lon_name!r} (is_dim={lon_name in dims if lon_name else False})\n" + f" Requested y dim: {y_name!r} (is_dim={y_name in dims if y_name else False})\n" + f" Requested x dim: {x_name!r} (is_dim={x_name in dims if x_name else False})\n" + f" Available dims for {base_var!r}: {dims}" + ) + ds = _ensure_sorted(ds, lat_dim, lon_dim) da = ds[base_var] dims = list(da.dims) - # Unify/ensure time dimension is named 'time' - time_dim = "time" if 
"time" in dims else next( - (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), - None - ) - if time_dim is None: + # Validate that lat/lon dims are 1D coordinate vectors + glat = np.asarray(ds[lat_dim].values) + glon = np.asarray(ds[lon_dim].values) + if glat.ndim != 1 or glon.ndim != 1: + ds.close() + raise ValueError( + f"Nearest-grid method requires 1D coordinate vectors for '{lat_dim}' and '{lon_dim}'. " + f"Got shapes: {lat_dim}={glat.shape}, {lon_dim}={glon.shape}." + ) + + # time dim should already be 'time' because safe_open... renames it, but keep the fallback + if "time" not in dims: ds.close() - raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") - if time_dim != "time": - ds = ds.rename({time_dim: "time"}) - da = ds[base_var] - dims = list(da.dims) + raise ValueError(f"No 'time' dim after decoding for '{base_var}'. dims={dims}") # Resolve extra dimensions (pressure level, ensemble, expver, etc.) # For the "level" dim: pick closest to `target_level` (or 1000 hPa by default). @@ -538,7 +640,14 @@ def _nearest_indices_vectorized(arr, vals): return out, pd.NaT, pd.NaT -def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): +def annotate_env_IDW( + df, + env_var_map, + selected_vars, + movebank_path, + smoothing_k: int = 2, + env_coord_names: dict | None = None, +): """ Annotate movement points with environmental values using: - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes @@ -562,7 +671,9 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: Kept for signature symmetry with the rest of the pipeline (output path handled upstream). smoothing_k : int Number of nearest grid nodes for IDW (>=2). - + env_coord_names : dict | None + Mapping of coordinate names for env datasets. Expected keys: + 'env_time', 'env_x', 'env_y', 'env_lat', 'env_lon'. 
Returns ------- (out_df, nc_start, nc_end) @@ -597,7 +708,15 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: base_var, target_level = _split_var_and_level(label) try: - ds = safe_open_nc_with_time_decoding(file_path) + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") + lat_name = env_coord_names.get("env_lat") + lon_name = env_coord_names.get("env_lon") + x_name = env_coord_names.get("env_x") + y_name = env_coord_names.get("env_y") + + ds = safe_open_nc_with_time_decoding(file_path, time_name=time_name) + if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not in {file_path}") ds.close() @@ -606,24 +725,47 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: da = ds[base_var] dims = list(da.dims) - # Detect coordinate names and sort dataset (required by nearest/k-nearest search) - lat_dim = "lat" if "lat" in dims else "latitude" - lon_dim = "lon" if "lon" in dims else "longitude" + # coordinate/dimension selection + lat_dim = lat_name if (lat_name in dims) else None + lon_dim = lon_name if (lon_name in dims) else None + + if lat_dim is None or lon_dim is None: + # Strict fallback to x/y only if user provided them AND they are dims + x_dim = x_name if (x_name in dims) else None + y_dim = y_name if (y_name in dims) else None + + if x_dim is not None and y_dim is not None: + lat_dim = y_dim + lon_dim = x_dim + else: + ds.close() + raise ValueError( + "Could not resolve spatial dimensions from the provided env_coord_names.\n" + f" Requested lat dim: {lat_name!r} (is_dim={lat_name in dims if lat_name else False})\n" + f" Requested lon dim: {lon_name!r} (is_dim={lon_name in dims if lon_name else False})\n" + f" Requested y dim: {y_name!r} (is_dim={y_name in dims if y_name else False})\n" + f" Requested x dim: {x_name!r} (is_dim={x_name in dims if x_name else False})\n" + f" Available dims for {base_var!r}: {dims}" + ) + ds = _ensure_sorted(ds, lat_dim, 
lon_dim) da = ds[base_var] dims = list(da.dims) - # Unify time dimension name to 'time' - time_dim = "time" if "time" in dims else next( - (d for d in ("valid_time", "forecast_time", "verification_time", "t", "Time") if d in dims), None - ) - if time_dim is None: + # Validate that lat/lon dims are 1D coordinate vectors + glat = np.asarray(ds[lat_dim].values) + glon = np.asarray(ds[lon_dim].values) + if glat.ndim != 1 or glon.ndim != 1: + ds.close() + raise ValueError( + f"IDW method requires 1D coordinate vectors for '{lat_dim}' and '{lon_dim}'. " + f"Got shapes: {lat_dim}={glat.shape}, {lon_dim}={glon.shape}." + ) + + # time dim should already be 'time' because safe_open... renames it + if "time" not in dims: ds.close() - raise ValueError(f"No time-like dimension in '{base_var}': dims={dims}") - if time_dim != "time": - ds = ds.rename({time_dim: "time"}) - da = ds[base_var] - dims = list(da.dims) + raise ValueError(f"No 'time' dim after decoding for '{base_var}'. dims={dims}") # Resolve extra dimensions (pressure level, ensemble, expver, etc.) 
extra_dims = [d for d in dims if d not in ("time", lat_dim, lon_dim)] @@ -907,7 +1049,7 @@ def _idw(values, distances, p=2): def _detect_time_name(ds): # 1)quick candidates by name - name_candidates = ("time","valid_time","forecast_time","verification_time","t","Time","datetime","date") + name_candidates = ("time", "timestamp", "Timestamp", "Time", "valid_time", "forecast_time", "verification_time", "t", "Time", "datetime", "date") for c in name_candidates: if c in ds.coords or c in ds.variables: return c diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index eb1ff29..9fcf049 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -3,15 +3,16 @@ import panel as pn import param import pandas as pd +import xarray as xr from panel.io.loading import start_loading_spinner, stop_loading_spinner from ecodata.app.models import FileSelector from ecodata.panel_utils import param_widget, register_view, try_catch, rename_param_widgets from ecodata.app.config import DEFAULT_TEMPLATE from datetime import datetime import re -from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv -from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only, delete_files -from ecodata.annotation_eng_func import start_annotation_process,convert_tif_to_nc_before_annotation, get_nc_bounds, safe_open_nc_with_time_decoding +from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv +from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only, delete_files +from ecodata.annotation_eng_func import start_annotation_process,convert_tif_to_nc_before_annotation, get_nc_bounds, open_nc_metadata, detect_env_coord_names, safe_open_nc_with_time_decoding logger = 
logging.getLogger(__file__) @@ -68,7 +69,20 @@ class movebank_annotation_engine(param.Parameterized): load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") reset_bound_button = pn.widgets.Button(name="(!) Reset boundary", button_type="primary") - env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl for multiple)", options=[], height = 140 ) + + # Selections for netcdf variable labels + env_time_select = pn.widgets.Select(name="Env time coordinate", options=[], value=None) + env_spatial_mode = pn.widgets.RadioButtonGroup(name="Env spatial coordinate mode", + options=["Geographic (lat/lon)", "Projected (x/y)"], + value="Geographic (lat/lon)", + button_type="default", + ) + env_lat_select = pn.widgets.Select(name="Latitude", options=[], value=None) + env_lon_select = pn.widgets.Select(name="Longitude", options=[], value=None) + env_x_select = pn.widgets.Select(name="X coordinate", options=[], value=None) + env_y_select = pn.widgets.Select(name="Y coordinate", options=[], value=None) + + env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl for multiple)", options=[], height = 140 ) taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl for multiple)", height = 140) id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl for multiple)", height = 140) env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", @@ -93,7 +107,7 @@ class movebank_annotation_engine(param.Parameterized): value="Inverse Distance Weighting (time-linear)" ) make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") - + status_text = param.String("Ready...") #TIF widgets @@ -155,7 +169,7 @@ def __init__(self, **params): "make_csv", "merge_files", "delete_individual_ID_files","folder_to_merge", "delete_empty_columns", "out_merged_csv_name", - "merge_files_button", + "merge_files_button", # === NC Annotation tab === "env_data_selector", "bound_data_selector", "movement_data_selector", @@ -163,13 +177,13 @@ def __init__(self, **params): "load_movement_button", "env_data_multiselect", "taxon_multiselect", "id_multiselect", "boundary_info_str", "interpolation_method", - "control_smoothing", + "control_smoothing", "env_info", "movement_info" ,"output_path", "make_annotation_button", # === TIF Annotation tab === "tif_env_data_selector", "tif_movement_data_selector", - "tif_bound_data_selector","tif_reset_bound_button", + "tif_bound_data_selector","tif_reset_bound_button", "tif_env_data_multiselect", "tif_taxon_multiselect", "tif_id_multiselect", @@ -178,15 +192,20 @@ def __init__(self, **params): "tif_make_annotation_button" ] ) - + self._latlon_widgets = pn.Column(self.env_lat_select, self.env_lon_select, sizing_mode="stretch_width") + self._xy_widgets = pn.Column(self.env_x_select, self.env_y_select, sizing_mode="stretch_width") self.df = None self.alert = pn.pane.Markdown(self.status_text) - NC_H = 1080 + NC_H = 1080 # === NC tab === self._nc_col1 = self._section( "Environmental data (.nc)", pn.Column(self.env_data_selector, sizing_mode="stretch_width"), self.load_env_button, + self.env_time_select, + self.env_spatial_mode, + self._latlon_widgets, + self._xy_widgets, self.env_data_multiselect, self.env_info, self.interpolation_method, @@ -224,7 +243,7 @@ def __init__(self, **params): ) # TIF - TIF_H = 1080 + TIF_H = 1080 self._tif_col1 = self._section( 
"Environmental data (.tif) - select one (of)", pn.Column(self.tif_env_data_selector, sizing_mode="stretch_width"), @@ -240,7 +259,7 @@ def __init__(self, **params): self._tif_col2 = self._section( "Movebank data (.csv)", - pn.Column(self.tif_movement_data_selector, sizing_mode="stretch_width"), + pn.Column(self.tif_movement_data_selector, sizing_mode="stretch_width"), self.tif_load_movement_button, self.tif_taxon_multiselect, self.tif_movement_info, @@ -249,7 +268,7 @@ def __init__(self, **params): self._tif_col3 = self._section( "Boundary data (.shp/.geojson)", - pn.Column(self.tif_bound_data_selector, sizing_mode="stretch_width"), + pn.Column(self.tif_bound_data_selector, sizing_mode="stretch_width"), pn.Row(self.tif_load_bound_button, self.tif_reset_bound_button), self.tif_id_multiselect, self.tif_boundary_info_str, @@ -297,8 +316,8 @@ def __init__(self, **params): ("Annotation engine - .tif", self.anotation_engine_tif_tab), ("Crop & interpolate csv", self.crop_interpolate_tab), ("Merge csv", self.merge_tab), - ) - + ) + self.simple_interp_button.on_click(self.run_interpolate_missing_only) self.load_data_button.on_click(self.load_ids_from_file) self.make_csv.on_click(self.run_make_csv) @@ -325,7 +344,11 @@ def __init__(self, **params): self.tif_taxon_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("Taxons", e.new), "value") self.tif_id_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("IDs", e.new), "value") self.tif_interpolation_method.param.watch(self._update_smoothing_options_tif, 'value') - + + # environmental spatial mode + self.env_spatial_mode.param.watch(lambda e: self._apply_env_spatial_mode(), "value") + self._apply_env_spatial_mode() # set initial enabled/disabled state + @try_catch("Error loading Individual IDs") def load_ids_from_file(self, *events): @@ -443,7 +466,7 @@ def _set_time_slider_from_df(self, df: pd.DataFrame): candidates = ("timestamp", "eobs_start_timestamp", "time", "datetime", "date") time_col 
= next((c for c in candidates if c in df.columns), None) if not time_col: - return + return ts = pd.to_datetime(df[time_col], errors="coerce") ts = ts[ts.notna()] @@ -475,11 +498,24 @@ def run_merge_files(self, *events): self.status_text = f"Failed: {e}" self.alert.object = self.status_text - + @try_catch("Error loading environmental data") def load_env_data(self, *events): - """We select exactly one .nc, update File/Time/Spatial and the list of 3D variables.""" + """ + Load a single environmental NetCDF for UI inspection (metadata only). + + This method: + 1) Opens the dataset without time decoding (fast metadata read), + 2) Detects candidate coordinate names (time/x/y/lat/lon), + 3) Populates the variable list (including pressure-level-expanded labels), + 4) Updates the info pane (File/Spatial + optionally Time if readily decodable). + + Notes + ----- + - This function is intended for UI responsiveness. It avoids CF-time decoding + unless explicitly needed later (e.g., during annotation). + """ self.status_text = "Loading environmental data..." 
self.alert.object = self.status_text @@ -509,33 +545,73 @@ def load_env_data(self, *events): self._auto_height(self.env_info) var_file_map: dict[str, str] = {} - time_text = "-" + time_text = "-" # will remain "-" unless we can decode reliably/cheaply spatial_text = "-" - # Auxiliary coordinate name candidates - time_candidates = ("time","Time","datetime","date","valid_time","forecast_time","verification_time") - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x", "long") - try: - ds = safe_open_nc_with_time_decoding(nc_path) + ds = open_nc_metadata(nc_path) try: - # ---- TIME ---- - time_name = next((c for c in time_candidates if c in ds.coords or c in ds.variables), None) - if time_name is not None: - tmin = pd.to_datetime(ds[time_name].values.min()) - tmax = pd.to_datetime(ds[time_name].values.max()) - time_text = f"{tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}" - - # ---- SPATIAL ---- - lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) - if lat_name and lon_name: - lat_min = float(ds[lat_name].min()) - lat_max = float(ds[lat_name].max()) - lon_min = float(ds[lon_name].min()) - lon_max = float(ds[lon_name].max()) - spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" + # Autodetect coordinate names + coord_guess = detect_env_coord_names(ds) + + # Populate dropdown menus + self._populate_env_coord_dropdowns(ds, coord_guess) + + # Auto-set spatial mode based on available coordinates + has_latlon = bool(self.env_lat_select.value and self.env_lon_select.value) + has_xy = bool(self.env_x_select.value and self.env_y_select.value) + + if has_latlon and not has_xy: + self.env_spatial_mode.value = "Geographic (lat/lon)" + elif has_xy and not has_latlon: + self.env_spatial_mode.value = "Projected (x/y)" + # if both exist, don’t override user choice + + + env_time 
= self.env_time_select.value + env_lat = self.env_lat_select.value + env_lon = self.env_lon_select.value + env_x = self.env_x_select.value + env_y = self.env_y_select.value + + # Update spatial_text with range from dataset (prefer lat/lon, fallback to x/y) + if env_lat and env_lon and (env_lat in ds) and (env_lon in ds): + try: + lat_min = float(ds[env_lat].min()) + lat_max = float(ds[env_lat].max()) + lon_min = float(ds[env_lon].min()) + lon_max = float(ds[env_lon].max()) + spatial_text = ( + f"{env_lat}[{lat_min:.3f}..{lat_max:.3f}], " + f"{env_lon}[{lon_min:.3f}..{lon_max:.3f}]" + ) + except Exception: + spatial_text = "-" + elif env_x and env_y and (env_x in ds) and (env_y in ds): + # Projected coordinates (units may be meters) + try: + x_min = float(ds[env_x].min()) + x_max = float(ds[env_x].max()) + y_min = float(ds[env_y].min()) + y_max = float(ds[env_y].max()) + spatial_text = ( + f"{env_y}[{y_min:.3f}..{y_max:.3f}], " + f"{env_x}[{x_min:.3f}..{x_max:.3f}]" + ) + except Exception: + spatial_text = "-" + + # Update time_text with time range if time decoding is cheap + if env_time and (env_time in ds.coords or env_time in ds.variables): + try: + # Only attempt lightweight decode when CF-like units are present + decoded_times = xr.decode_cf(ds[[env_time]], decode_times=True)[env_time] + tmin = decoded_times.min() + tmax = decoded_times.max() + time_text = f"{tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}" + except Exception: + time_text = "-" + # ---- Перелік змінних з підтримкою вертикальних рівнів ---- LEVEL_DIM_CANDIDATES = ("isobaricInhPa", "isobaric_in_hPa", "level", "lev", "plev", "pressure", "pressure_level") @@ -704,6 +780,7 @@ def run_annotation(self, *events): self.status_text = "Running annotation..." 
self.alert.object = self.status_text try: + env_coord_names = self._get_env_coord_names_from_ui() selected_vars = self.env_data_multiselect.value selected_ids = self.id_multiselect.value env_var_map = getattr(self, "env_variable_sources", {}) @@ -730,7 +807,7 @@ def run_annotation(self, *events): return try: - bounds = get_nc_bounds(nc_path) # {"S":..., "N":..., "W":..., "E":...} + bounds = get_nc_bounds(nc_path, env_coord_names=env_coord_names ) # {"S":..., "N":..., "W":..., "E":...} bbox = bounds # Updating the border information panel self.boundary_info_str.object = ( @@ -748,7 +825,7 @@ def run_annotation(self, *events): start_annotation_process( env_var_map, selected_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=bbox, smoothing_k=smoothing_points, - out_csv_path=self.output_path.value + out_csv_path=self.output_path.value, env_coord_names=env_coord_names ) self.status_text = "Annotation finished." @@ -1060,8 +1137,8 @@ def run_annotation_tif(self, *events): selected_ids=selected_ids, boundary_path=str(boundary_path) if boundary_path else None, interpolation_method=interp_method, - bbox=bbox, - smoothing_k=int(self.tif_control_smoothing.value), + bbox=bbox, + smoothing_k=int(self.tif_control_smoothing.value), out_csv_path=output_csv_path ) self.status_text = "Annotation finished successfully (TIF)." @@ -1076,7 +1153,7 @@ def run_annotation_tif(self, *events): except Exception: pass - + @try_catch("Error loading TIF boundary data") def load_boundary_data_tif(self, *events): self.status_text = "Loading TIF boundary data..." 
@@ -1142,7 +1219,7 @@ def load_movement_data_tif(self, *events): if lines: lines[0] = f"File: {Path(file_path).name}" - # + # try: ts = pd.to_datetime(df["timestamp"], errors="coerce") lat = pd.to_numeric(df["location_lat"], errors="coerce") @@ -1336,7 +1413,7 @@ def _section(self, title, *items, height=None): sizing_mode="stretch_width", height=height, ) - + def _auto_height(self, pane, line_px=22, padding=8): lines = [l for l in (pane.object or "").split("
") if l.strip()] @@ -1387,7 +1464,7 @@ def _apply_nc_height_from_first(self): if h is None: return self._nc_col2.height = h - self._nc_col3.height = h + self._nc_col3.height = h def reset_boundary_data(self, *events): @@ -1409,7 +1486,82 @@ def reset_boundary_data(self, *events): self.status_text = "Boundary reset to default (auto from .nc)." self.alert.object = self.status_text - self._sync_nc_column_heights() + self._sync_nc_column_heights() + + def _populate_env_coord_dropdowns(self, ds: xr.Dataset, coord_guess: dict) -> None: + """ + Populate environmental coordinate dropdown menus from a dataset and autodetection. + + Parameters + ---------- + ds : xarray.Dataset + Environmental dataset opened for metadata inspection. + coord_guess : dict + Output from `detect_env_coord_names(ds)`. Expected keys: + 'env_time', 'env_x', 'env_y', 'env_lat', 'env_lon'. Values may be None. + + Returns + ------- + None + Updates UI widgets in-place. + """ + # Options: include both coords and variables (some datasets store coords as variables) + coord_names = list(ds.coords.keys()) + var_names = list(ds.variables.keys()) # includes coords too, but that's fine + options = sorted(set(coord_names) | set(var_names)) + + # Helper to set widget options + default value safely + def _set_select(widget: pn.widgets.Select, guess_value: str | None) -> None: + widget.options = options + if guess_value in options: + widget.value = guess_value + else: + widget.value = widget.value if widget.value in options else None + + _set_select(self.env_time_select, coord_guess.get("env_time")) + _set_select(self.env_lat_select, coord_guess.get("env_lat")) + _set_select(self.env_lon_select, coord_guess.get("env_lon")) + _set_select(self.env_x_select, coord_guess.get("env_x")) + _set_select(self.env_y_select, coord_guess.get("env_y")) + + def _get_env_coord_names_from_ui(self) -> dict: + mode = self.env_spatial_mode.value + env_time = self.env_time_select.value + + if not env_time: + raise ValueError("Select 
Env time coordinate.") + + if mode == "Geographic (lat/lon)": + if not self.env_lat_select.value or not self.env_lon_select.value: + raise ValueError("Select both Env latitude coordinate and Env longitude coordinate.") + return { + "env_time": env_time, + "env_lat": self.env_lat_select.value, + "env_lon": self.env_lon_select.value, + "env_x": None, + "env_y": None, + } + + # Projected (x/y) + if not self.env_x_select.value or not self.env_y_select.value: + raise ValueError("Select both Env x coordinate and Env y coordinate.") + + return { + "env_time": env_time, + "env_lat": None, + "env_lon": None, + "env_x": self.env_x_select.value, + "env_y": self.env_y_select.value, + } + + + def _apply_env_spatial_mode(self) -> None: + mode = self.env_spatial_mode.value + + use_latlon = (mode == "Geographic (lat/lon)") + + self._latlon_widgets.visible = use_latlon + self._xy_widgets.visible = not use_latlon @register_view() From 4cbeecd9fc9eb44232480cb748b4f1c717ac66a9 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Thu, 22 Jan 2026 01:45:59 -0700 Subject: [PATCH 05/17] Add option for bilinear interpolation in projected datasets (e.g. 
NARR) --- ecodata/annotation_eng_func.py | 287 +++++++++++++++++++++- ecodata/app/apps/annotation_engine_app.py | 31 ++- 2 files changed, 305 insertions(+), 13 deletions(-) diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index 455dc58..dab981b 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -7,6 +7,7 @@ import numpy as np from datetime import datetime import rasterio +from pyproj import CRS, Transformer LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") @@ -418,17 +419,26 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, Supports: - "Nearest neighbour (time-linear)" - "IDW (time-linear)" + - "Bilinear (projected x/y, time-linear)" + """ label = (interpolation_method or "").strip().lower() label = label.replace("neighbor", "neighbour") # Normalise US/UK spelling is_nearest = label.startswith("nearest") is_idw = ("idw" in label) or ("inverse distance" in label) + is_bilinear = "bilinear" in label + if is_nearest: return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, env_coord_names=env_coord_names) elif is_idw: return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, env_coord_names=env_coord_names) + elif is_bilinear: + return annotate_env_bilinear_projected( + df, env_var_map, selected_vars, movebank_path, + env_coord_names=env_coord_names + ) else: raise ValueError(f"Unknown interpolation method: {interpolation_method}") @@ -841,7 +851,168 @@ def annotate_env_IDW( out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] return out, pd.NaT, pd.NaT +def annotate_env_bilinear_projected( + df, + env_var_map, + selected_vars, + movebank_path, + env_coord_names: dict | None = None, +): + """ + Annotate movement points with environmental values using: + - Spatial: bilinear interpolation on a 1D projected grid (x/y) + 
- Temporal: linear interpolation in time (xarray interp) + + Tracks input: + - requires lon/lat columns: location_lon, location_lat + - projects lon/lat -> x/y into the env dataset's native CRS using CF metadata + + Env input: + - dataset has 1D x and y coordinate vectors (projected grid) + - dataset provides CF projection metadata so `read_crs_from_cf()` can infer CRS + + Returns: (out_df, pd.NaT, pd.NaT) for signature compatibility. + """ + out = df.copy() + out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") + + # Require lon/lat (your code already normalizes movement columns sometimes) + required = ["timestamp", "location_lat", "location_lon"] + out = out.dropna(subset=required) + + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") # optional + x_name = env_coord_names.get("env_x") + y_name = env_coord_names.get("env_y") + + if not x_name or not y_name: + raise ValueError( + "Bilinear (projected) requires env_coord_names['env_x'] and ['env_y'] " + "(Projected (x/y) mode)." 
+ ) + if env_coord_names.get("env_lat") or env_coord_names.get("env_lon"): + raise ValueError("Bilinear (projected) requires Projected (x/y) spatial mode, not Geographic (lat/lon).") + # Target time values (vectorized) + tgt_t = out["timestamp"].to_numpy("datetime64[ns]") + + # Track lon/lat arrays + lon = pd.to_numeric(out["location_lon"], errors="coerce").to_numpy(dtype="float64") + lat = pd.to_numeric(out["location_lat"], errors="coerce").to_numpy(dtype="float64") + + # Drop any rows with bad numeric lon/lat + good = np.isfinite(lon) & np.isfinite(lat) & out["timestamp"].notna().to_numpy() + if not good.all(): + out = out.loc[good].copy() + tgt_t = tgt_t[good] + lon = lon[good] + lat = lat[good] + + # QA columns + out["x"] = np.nan + out["y"] = np.nan + + # Cache CRS/transformer per file path (since you may have multiple labels/files) + crs_cache: dict[str, "CRS"] = {} + + for label in selected_vars: + file_path = env_var_map.get(label) + out[label] = np.nan + + if not file_path or not Path(file_path).is_file(): + print(f"[WARNING] File for {label} not found: {file_path}") + continue + + base_var, target_level = _split_var_and_level(label) + + try: + ds = safe_open_nc_with_time_decoding(file_path, time_name=time_name) + + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") + ds.close() + continue + + da = ds[base_var] + dims = list(da.dims) + + # Must be able to interpolate along x/y dims + x_dim = x_name if x_name in dims else None + y_dim = y_name if y_name in dims else None + if x_dim is None or y_dim is None: + ds.close() + raise ValueError( + f"Bilinear requires x/y to be dims of {base_var!r}.\n" + f" Requested x dim: {x_name!r} (is_dim={x_name in dims})\n" + f" Requested y dim: {y_name!r} (is_dim={y_name in dims})\n" + f" Available dims: {dims}" + ) + + # Sort for interpolation stability + ds = _ensure_sorted(ds, y_dim, x_dim) + da = ds[base_var] + dims = list(da.dims) + + if "time" not in dims: + ds.close() + 
raise ValueError(f"No 'time' dim after decoding for '{base_var}'. dims={dims}") + + # Validate 1D x/y coordinate vectors + gx = np.asarray(ds[x_dim].values) + gy = np.asarray(ds[y_dim].values) + if gx.ndim != 1 or gy.ndim != 1: + ds.close() + raise ValueError( + f"Bilinear method requires 1D coordinate vectors for '{y_dim}' and '{x_dim}'. " + f"Got shapes: {y_dim}={gy.shape}, {x_dim}={gx.shape}." + ) + + # Handle extra dims (pressure level, ensemble, expver, etc.) + extra_dims = [d for d in dims if d not in ("time", y_dim, x_dim)] + if extra_dims: + sel = {} + for d in extra_dims: + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) + else: + sel[d] = 0 + da = da.isel(**sel).squeeze() # -> (time, y, x) + + # --- CRS inference + projection lon/lat -> x/y ------------------------- + if file_path not in crs_cache: + # Prefer variable-specific grid_mapping lookup by passing base_var + crs_cache[file_path] = read_crs_from_cf(ds, var_name=base_var) + + target_crs = crs_cache[file_path] + x_pts, y_pts = project_tracks_lonlat_to_xy(lon, lat, target_crs=target_crs) + + # Store for QA + out["x"] = x_pts + out["y"] = y_pts + + # --- vectorized xarray interpolation ----------------------------------- + pts = xr.Dataset( + coords={"points": np.arange(len(out))}, + data_vars={ + "time": ("points", tgt_t), + x_dim: ("points", x_pts), + y_dim: ("points", y_pts), + }, + ) + + sampled = da.interp({x_dim: pts[x_dim], y_dim: pts[y_dim], "time": pts["time"]}) + out[label] = sampled.to_numpy() + + ds.close() + + except Exception as e: + print(f"[ERROR] {label}: {e}") + continue + + # If you want: geometry in projected CRS (x,y). Comment out if not needed. + out["geometry"] = [Point(x, y) for x, y in zip(out["x"], out["y"])] + + return out, pd.NaT, pd.NaT def convert_tif_to_nc_before_annotation(tif_paths, output_dir): """ Converts a list of .tif files into a single NetCDF, creating a separate DataArray per variable. 
@@ -1095,4 +1266,118 @@ def _pick_level_index(ds, level_dim: str, target_level: float | None): ref = 1000.0 if target_level is None else float(target_level) return int(np.nanargmin(np.abs(vals - ref))) except Exception: - return 0 \ No newline at end of file + return 0 + +def read_crs_from_cf(ds: xr.Dataset, var_name: str | None = None) -> CRS: + """ + Infer the projected coordinate reference system (CRS) of a gridded + environmental dataset using CF-convention metadata. + + The function attempts, in order: + 1) to read a CF-compliant ``grid_mapping`` attribute from a data variable, + 2) to construct a CRS from global dataset attributes (e.g. WKT or PROJ), + 3) to read CRS information from a standalone ``crs`` variable. + + This is intended for datasets on projected grids (e.g. NARR, ERA5-Land, + regional climate models) where track data in WGS84 lon/lat must be + transformed to native x/y coordinates before spatial interpolation. + + Parameters + ---------- + ds : xarray.Dataset + Environmental dataset containing projected horizontal coordinates + and CF-compliant projection metadata. + var_name : str or None, optional + Name of a data variable whose ``grid_mapping`` attribute should be + inspected first. If None, variable-specific metadata are skipped. + + Returns + ------- + pyproj.CRS + Coordinate reference system describing the dataset's native + horizontal projection. + + Raises + ------ + ValueError + If no usable CRS information can be inferred from the dataset. 
+ """ + + # 1) If a data variable is given, try its grid_mapping attribute + grid_mapping_name = None + if var_name is not None and var_name in ds: + grid_mapping_name = ds[var_name].attrs.get("grid_mapping") + + # 2) If we have a grid mapping variable, parse it as CF + if grid_mapping_name and grid_mapping_name in ds.variables: + gm = ds[grid_mapping_name] + # xarray keeps attrs as dict; pyproj can build CRS from CF dict + try: + return CRS.from_cf(gm.attrs) + except Exception: + pass + + # 3) Common alternate places: global attrs + # Try "crs_wkt", "spatial_ref" (GDAL), "proj4", "proj" + for key in ("crs_wkt", "spatial_ref", "proj_wkt", "wkt"): + wkt = ds.attrs.get(key) + if isinstance(wkt, str) and wkt.strip(): + return CRS.from_wkt(wkt) + + for key in ("proj4", "proj4text", "proj", "projection"): + proj = ds.attrs.get(key) + if isinstance(proj, str) and proj.strip(): + return CRS.from_string(proj) + + # 4) Sometimes there is a standalone "crs" variable with WKT in attrs + if "crs" in ds.variables: + crs_var = ds["crs"] + for key in ("crs_wkt", "spatial_ref"): + wkt = crs_var.attrs.get(key) + if isinstance(wkt, str) and wkt.strip(): + return CRS.from_wkt(wkt) + # Or CF attrs + try: + return CRS.from_cf(crs_var.attrs) + except Exception: + pass + + raise ValueError("Could not infer CRS from dataset (no usable CF grid_mapping / WKT / proj string found).") + + +def project_tracks_lonlat_to_xy( + lon: np.ndarray, + lat: np.ndarray, + target_crs: CRS, +) -> tuple[np.ndarray, np.ndarray]: + """ + Project track locations from geographic coordinates (longitude, latitude) + to the native x/y coordinate system of a projected environmental grid. + + This function is used to transform animal tracking locations + (WGS84 lon/lat) into the coordinate system of gridded datasets such as + NARR before spatial interpolation using xarray. + + Parameters + ---------- + lon : array-like + Longitudes of track locations in degrees east (EPSG:4326). 
+ lat : array-like + Latitudes of track locations in degrees north (EPSG:4326). + target_crs : pyproj.CRS + Target projected CRS describing the environmental dataset grid. + + Returns + ------- + x : numpy.ndarray + Projected x-coordinates of track locations in the target CRS. + y : numpy.ndarray + Projected y-coordinates of track locations in the target CRS. + """ + + lon = np.asarray(lon, dtype=float) + lat = np.asarray(lat, dtype=float) + + transformer = Transformer.from_crs("EPSG:4326", target_crs, always_xy=True) + x, y = transformer.transform(lon, lat) + return np.asarray(x, dtype=float), np.asarray(y, dtype=float) \ No newline at end of file diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index 9fcf049..5c0b3fe 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -103,7 +103,7 @@ class movebank_annotation_engine(param.Parameterized): ) interpolation_method = pn.widgets.Select( name="Interpolation method (spatial)", - options=["Nearest neighbor (time-linear)", "Inverse Distance Weighting (time-linear)"], + options=["Nearest neighbor (time-linear)", "Inverse Distance Weighting (time-linear)", "Bilinear (projected x/y, time-linear)"], value="Inverse Distance Weighting (time-linear)" ) make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") @@ -806,19 +806,26 @@ def run_annotation(self, *events): self.alert.object = self.status_text return - try: - bounds = get_nc_bounds(nc_path, env_coord_names=env_coord_names ) # {"S":..., "N":..., "W":..., "E":...} - bbox = bounds - # Updating the border information panel + # Only attempt lat/lon bbox when we are in Geographic mode + if self.env_spatial_mode.value == "Geographic (lat/lon)": + try: + bounds = get_nc_bounds(nc_path, env_coord_names=env_coord_names) + bbox = bounds + self.boundary_info_str.object = ( + "Boundary file: not selected (auto from .nc)
" + f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " + f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to derive boundary from .nc: {e}" + self.alert.object = self.status_text + return + else: + # Projected mode: don't attempt lat/lon bbox self.boundary_info_str.object = ( - "Boundary file: not selected (auto from .nc)
" - f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " - f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + "Boundary file: not selected
" + "Spatial range: using projected grid extent (x/y); bbox cropping disabled." ) - except Exception as e: - self.status_text = f"Failed to derive boundary from .nc: {e}" - self.alert.object = self.status_text - return self.status_text = "Annotation started." # pass bbox (or None, if the user did choose shp) From bb06c5567f99f0b92f1fbe44224224a26d8ac698 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Thu, 9 Apr 2026 16:53:40 -0600 Subject: [PATCH 06/17] make sure set_time_encoding_modis function uses float dtype --- ecodata/xr_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecodata/xr_tools.py b/ecodata/xr_tools.py index 6051013..0e12ef1 100644 --- a/ecodata/xr_tools.py +++ b/ecodata/xr_tools.py @@ -458,6 +458,6 @@ def set_time_encoding_modis(ds): ds : xarray.Dataset Dataset for which the encodings will be modified. This function will modify the encoding format in place. """ - modis_encoding = {'units': 'days since 2000-01-01', 'calendar': 'julian'} + modis_encoding = {'units': 'days since 2000-01-01', 'calendar': 'julian', "dtype": "float64"} for key in modis_encoding: ds.time.encoding[key] = modis_encoding[key] \ No newline at end of file From a916a19041805cbd5eecf830c8467cf94c6e3921 Mon Sep 17 00:00:00 2001 From: olekshche Date: Wed, 13 May 2026 11:35:23 +0300 Subject: [PATCH 07/17] Add Presence Data Preparation, Multidimensional Annotation, and NC Builder apps --- docs/apps/user_guide/annotation_engine.md | 26 + docs/apps/user_guide/index.md | 4 + .../user_guide/multidimensional_annotation.md | 23 + docs/apps/user_guide/nc_builder.md | 25 + .../user_guide/presence_data_preparation.md | 24 + ecodata-dev-env.yml | 1 + ecodata-env.yml | 2 + ecodata/__init__.py | 11 + ecodata/annotation_eng_func.py | 469 +++- ecodata/app/apps/__init__.py | 4 + ecodata/app/apps/annotation_engine_app.py | 665 ++++- ecodata/app/apps/gridded_data_explorer_app.py | 6 +- .../apps/multidimensional_annotation_app.py | 1017 ++++++++ 
ecodata/app/apps/nc_builder_app.py | 855 +++++++ .../app/apps/presence_data_preparation_app.py | 767 ++++++ ecodata/movebank_functions.py | 6 +- ecodata/multidim_annotation_func.py | 2228 +++++++++++++++++ ecodata/nc_builder_functions.py | 930 +++++++ ecodata/presence_functions.py | 934 +++++++ 19 files changed, 7807 insertions(+), 190 deletions(-) create mode 100644 docs/apps/user_guide/annotation_engine.md create mode 100644 docs/apps/user_guide/multidimensional_annotation.md create mode 100644 docs/apps/user_guide/nc_builder.md create mode 100644 docs/apps/user_guide/presence_data_preparation.md create mode 100644 ecodata/app/apps/multidimensional_annotation_app.py create mode 100644 ecodata/app/apps/nc_builder_app.py create mode 100644 ecodata/app/apps/presence_data_preparation_app.py create mode 100644 ecodata/multidim_annotation_func.py create mode 100644 ecodata/nc_builder_functions.py create mode 100644 ecodata/presence_functions.py diff --git a/docs/apps/user_guide/annotation_engine.md b/docs/apps/user_guide/annotation_engine.md new file mode 100644 index 0000000..e4a09be --- /dev/null +++ b/docs/apps/user_guide/annotation_engine.md @@ -0,0 +1,26 @@ +# Annotation Engine + +## App features + +With the Annotation Engine App, you can +- Annotate movement data with environmental variables from gridded environmental datasets. +- Load movement data and environmental data from supported formats such as NetCDF and GeoTIFF. +- Select environmental variables for annotation. +- Match movement records with environmental values by location and time. +- Use different annotation approaches for continuous variables and categorical or quality-control variables. +- Apply spatial and temporal matching or interpolation methods where supported. +- Optionally apply scale factor and offset corrections to continuous variables. +- Export annotated movement data for further analysis or visualization. + +## Using the app + +1. 
If you haven't already, prepare a local movement data file and the environmental datasets you want to use for annotation. +2. Launch the Annotation Engine App. +3. Select the movement data file. The file should contain location and time information compatible with the ECODATA movement data format. +4. Load the environmental dataset or datasets. Depending on the workflow, these may be NetCDF or GeoTIFF files. +5. Select the environmental variables that should be added to the movement records. +6. Specify whether selected variables should be treated as continuous variables or categorical / quality-control variables. +7. Select the annotation method and, if available, the spatial or temporal interpolation options. +8. If using continuous variables with scale factor or offset values, set these options before running the annotation. +9. Run the annotation process. +10. Review the status messages and save the annotated movement data file. \ No newline at end of file diff --git a/docs/apps/user_guide/index.md b/docs/apps/user_guide/index.md index a83be3f..c07f77f 100644 --- a/docs/apps/user_guide/index.md +++ b/docs/apps/user_guide/index.md @@ -14,5 +14,9 @@ tracks_explorer gridded_data_explorer subsetter movie_maker +annotation_engine +presence_data_preparation +nc_builder +multidimensional_annotation ``` \ No newline at end of file diff --git a/docs/apps/user_guide/multidimensional_annotation.md b/docs/apps/user_guide/multidimensional_annotation.md new file mode 100644 index 0000000..f85f4c1 --- /dev/null +++ b/docs/apps/user_guide/multidimensional_annotation.md @@ -0,0 +1,23 @@ +# Multidimensional Annotation + +## App features + +With the Multidimensional Annotation App, you can +- Annotate movement data with environmental variables from multidimensional NetCDF datasets. +- Work with environmental data that include time, latitude, longitude, and vertical dimensions such as pressure level, height, or model level. 
+- Load movement records and match them with selected environmental variables. +- Select coordinate variables and environmental variables from NetCDF files. +- Use multidimensional environmental datasets for advanced annotation workflows. +- Export movement data enriched with selected environmental values. + +## Using the app + +1. If you haven't already, prepare a local movement data file and the NetCDF environmental files you want to use for annotation. +2. Launch the Multidimensional Annotation App. +3. Select the movement data file. The file should contain location and time information compatible with the ECODATA movement data format. +4. Load the NetCDF environmental dataset or datasets. +5. Select the coordinate variables used by the environmental file, such as time, latitude, longitude, and vertical level. +6. Select the environmental variables that should be extracted for the movement records. +7. Check that the spatial, temporal, and vertical coverage of the environmental data matches the movement data. +8. Run the annotation process. +9. Review the status messages and save the annotated output file. \ No newline at end of file diff --git a/docs/apps/user_guide/nc_builder.md b/docs/apps/user_guide/nc_builder.md new file mode 100644 index 0000000..4684322 --- /dev/null +++ b/docs/apps/user_guide/nc_builder.md @@ -0,0 +1,25 @@ +# NC Builder + +## App features + +With the NC Builder App, you can +- Prepare and standardize NetCDF files for use in ECODATA annotation workflows. +- Load one or more NetCDF files from local folders. +- Inspect available variables, coordinates, dimensions, and time information. +- Select target variables and assign standard coordinate roles such as time, latitude, longitude, and vertical level. +- Combine files by time, by level, or by both time and level, depending on the structure of the source data. +- Optionally apply spatial and temporal subsetting. 
+- Export a standardized NetCDF file that can be used by ECODATA annotation apps. + +## Using the app + +1. If you haven't already, prepare the NetCDF files that need to be combined or standardized. +2. Launch the NC Builder App. +3. Select the input folder or input files containing the NetCDF data. +4. Choose the combine mode, such as combining by time, by vertical level, or by both time and level. +5. Inspect the detected variables and coordinates. +6. Select the target variable and assign the correct coordinate fields for time, latitude, longitude, and, if needed, vertical level. +7. If needed, set spatial or temporal subset options. +8. Specify the output file name and location. +9. Click the build button to create the standardized NetCDF file. +10. Review the status messages and check the output file before using it in annotation workflows. \ No newline at end of file diff --git a/docs/apps/user_guide/presence_data_preparation.md b/docs/apps/user_guide/presence_data_preparation.md new file mode 100644 index 0000000..70bfe45 --- /dev/null +++ b/docs/apps/user_guide/presence_data_preparation.md @@ -0,0 +1,24 @@ +# Presence Data Preparation + +## App features + +With the Presence Data Preparation App, you can +- Prepare species occurrence or presence-type observation data for further visualization and analysis in ECODATA. +- Load observation data and, when available, associated sampling or effort data. +- Filter records by selected temporal, spatial, taxonomic, and data-quality criteria. +- Aggregate observations by user-defined time intervals. +- Optionally aggregate observations to a regular spatial grid. +- Calculate basic presence and observation-effort metrics. +- Export prepared tables and tracks-style files for further use in ECODATA visualization workflows. + +## Using the app + +1. If you haven't already, prepare local copies of the observation data and, if needed, the associated sampling or effort file. +2. Launch the Presence Data Preparation App. +3. 
Under the input file options, select the observation data file. If the workflow requires sampling or effort information, also select the corresponding sampling file. +4. Select the required filtering options, such as date range, species, review status, and spatial limits. +5. If needed, enable spatial grid aggregation and set the grid step. +6. Select the time aggregation interval. The app groups observations into fixed time windows based on the selected number of days. +7. Click the processing button to run the aggregation. +8. Review the status messages and generated output information. +9. Save the output files. The app can create aggregated count data, aggregated presence data, presence points, and tracks-style files for further ECODATA visualization. \ No newline at end of file diff --git a/ecodata-dev-env.yml b/ecodata-dev-env.yml index 6066158..e946e68 100644 --- a/ecodata-dev-env.yml +++ b/ecodata-dev-env.yml @@ -9,6 +9,7 @@ dependencies: - conda-forge::jupyterlab - pytest - pip +- geographiclib - pip: - black - isort diff --git a/ecodata-env.yml b/ecodata-env.yml index dc895f5..f4519dc 100644 --- a/ecodata-env.yml +++ b/ecodata-env.yml @@ -26,3 +26,5 @@ dependencies: - fiona - gdown<4.6 # gdown 4.6.something has a problem with our gdrive files - distributed +- geographiclib +- h5netcdf \ No newline at end of file diff --git a/ecodata/__init__.py b/ecodata/__init__.py index dbb2c1b..c86eeb0 100644 --- a/ecodata/__init__.py +++ b/ecodata/__init__.py @@ -60,3 +60,14 @@ get_nc_bounds, safe_open_nc_with_time_decoding ) +from ecodata.multidim_annotation_func import( + sample_era5_at_height, +) + +from ecodata.presence_functions import ( + VettingOptions, + AggregationOptions, + aggregate_ebird_to_files, + export_tracks_from_aggregated_counts, + read_species_from_agg_counts, +) \ No newline at end of file diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index 9c2ac99..1e0fae5 100644 --- a/ecodata/annotation_eng_func.py +++ 
b/ecodata/annotation_eng_func.py @@ -1,6 +1,8 @@ import xarray as xr import geopandas as gpd from pathlib import Path +import gc +import time import pandas as pd import re from shapely.geometry import Point @@ -148,7 +150,12 @@ def load_taxa_and_ids_from_csv(file_path): def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2, - out_csv_path=None): + out_csv_path=None, coord_spec=None, + continuous_vars=None, categorical_vars=None, + apply_value_correction: bool = False, + value_scale_factor: float = 1.0, + value_add_offset: float = 0.0, + value_correction_vars=None): """ env_var_map: dict[str, str] — variable → file path selected_env_vars: list[str] — selected variables @@ -181,12 +188,51 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele # === Step 2: Loading and interpolation of environmental data === result = load_selected_environmental_data(df_filtered, env_var_map, selected_env_vars, movebank_path, - interpolation_method, smoothing_k=smoothing_k) + interpolation_method, smoothing_k=smoothing_k, + coord_spec=coord_spec, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars) if result is None: print("[ERROR] Environmental data was not loaded.") return - df_annotated, nc_start, nc_end = result + df_annotated, ann_nc_start, ann_nc_end = result + # Optional post-sampling value correction + # Apply only to continuous variables after sampling/interpolation. + # This is methodologically safe for linear scale/offset: + # physical_value = raw_value * scale_factor + add_offset. + # Categorical/QC variables must remain as raw category/flag codes. 
+ if apply_value_correction: + if value_correction_vars is None: + correction_vars = list(continuous_vars or []) + else: + correction_vars = list(value_correction_vars or []) + + try: + scale = float(value_scale_factor) + offset = float(value_add_offset) + except Exception as e: + raise ValueError(f"Invalid scale factor / offset: {e}") + + for v in correction_vars: + if v not in df_annotated.columns: + print(f"[WARNING] Scale/offset skipped for '{v}': column not found.") + continue + + # Convert only the annotated continuous column. + # Non-numeric values become NaN, which is acceptable for continuous variables. + df_annotated[v] = pd.to_numeric(df_annotated[v], errors="coerce") * scale + offset + + print( + "[INFO] Applied post-sampling scale/offset to continuous variables: " + f"{correction_vars}; scale={scale}, offset={offset}" + ) + # Keep the real union NC range computed before annotation, + # unless an annotator explicitly returns a valid range in the future. + if not pd.isna(ann_nc_start): + nc_start = ann_nc_start + if not pd.isna(ann_nc_end): + nc_end = ann_nc_end #### diagnostic var = selected_env_vars[0] if selected_env_vars else None @@ -236,8 +282,13 @@ def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=Non print("[DEBUG] Filtering is started") df = pd.read_csv(movebank_path) df.columns = [re.sub(r"[-:.\s]+", "_", col.lower()) for col in df.columns] - if "location_long" in df.columns and "location_lon" not in df.columns: - df["location_lon"] = df["location_long"] + # --- unify longitude column to location_lon --- + if "location_lon" in df.columns and "location_long" in df.columns: + # both exist -> keep location_lon (canonical), drop location_long + df = df.drop(columns=["location_long"]) + elif "location_lon" not in df.columns and "location_long" in df.columns: + # only location_long -> rename to canonical location_lon + df = df.rename(columns={"location_long": "location_lon"}) if "timestamp" not in df.columns and 
"eobs_start_timestamp" in df.columns: df["timestamp"] = df["eobs_start_timestamp"] @@ -342,12 +393,22 @@ def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: def load_selected_environmental_data(df, env_var_map, selected_vars, - movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2): + movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2, + coord_spec=None, + continuous_vars=None, categorical_vars=None): """ Wrapper that calls the appropriate annotation function depending on the interpolation method. - Supports: - - "Nearest neighbour (time-linear)" - - "IDW (time-linear)" + + Current behaviour: + - Continuous + Nearest neighbour: + nearest spatial grid node + linear temporal interpolation + - Continuous + IDW: + k nearest spatial grid nodes + linear temporal interpolation per node + IDW + - Categorical/QC + Nearest neighbour: + nearest spatial grid node + nearest timestep + - Categorical/QC + IDW selected: + categorical/QC variables are not IDW-averaged; + they use nearest spatial grid node + nearest timestep """ label = (interpolation_method or "").strip().lower() label = label.replace("neighbor", "neighbour") # Normalise US/UK spelling @@ -355,19 +416,126 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, is_nearest = label.startswith("nearest") is_idw = ("idw" in label) or ("inverse distance" in label) - if is_nearest: - return annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) - elif is_idw: - return annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k) - else: + # Normalize interpolation method + method = (interpolation_method or "").lower() + is_nearest = ("nearest" in method) + is_idw = ("idw" in method) + + # If split lists are not provided, treat everything as "selected_vars" + cont = list(continuous_vars or []) + cat = list(categorical_vars or []) + + if not cont and not cat: + # everything 
in selected_vars, method applies to all + if is_nearest: + return annotate_env_nearest( + df, env_var_map, selected_vars, movebank_path, + smoothing_k=smoothing_k, coord_spec=coord_spec + ) + if is_idw: + return annotate_env_IDW( + df, env_var_map, selected_vars, movebank_path, + smoothing_k=smoothing_k, coord_spec=coord_spec + ) raise ValueError(f"Unknown interpolation method: {interpolation_method}") + + # If split lists are provided: + # 1) Nearest selected: + # continuous -> nearest grid node + linear time interpolation + # categorical/QC -> nearest grid node + nearest timestep + if is_nearest: + out_df = df + nc_start = pd.NaT + nc_end = pd.NaT + + # Continuous: nearest grid node + linear time interpolation + if cont: + out_df, nc_start, nc_end = annotate_env_nearest( + out_df, env_var_map, cont, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + temporal_method="linear" + ) + + # Categorical/QC: nearest grid node + nearest timestep + if cat: + out_df, nc_start2, nc_end2 = annotate_env_nearest( + out_df, env_var_map, cat, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + temporal_method="nearest" + ) + + if pd.isna(nc_start) and not pd.isna(nc_start2): + nc_start = nc_start2 + if pd.isna(nc_end) and not pd.isna(nc_end2): + nc_end = nc_end2 + + return out_df, nc_start, nc_end + + # 2) IDW selected -> cont=IDW, cat=NN + if is_idw: + out_df = df + nc_start = pd.NaT + nc_end = pd.NaT + + # continuous via IDW + if cont: + out_df, nc_start, nc_end = annotate_env_IDW( + out_df, env_var_map, cont, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + temporal_method="linear" + ) + + # categorical via Nearest neighbour in space + nearest timestep in time + if cat: + out_df, nc_start2, nc_end2 = annotate_env_nearest( + out_df, env_var_map, cat, movebank_path, + smoothing_k=smoothing_k, + coord_spec=coord_spec, + temporal_method="nearest" + ) + # keep nc_start/nc_end stable (both annotators return NaT) + if 
pd.isna(nc_start) and not pd.isna(nc_start2): + nc_start = nc_start2 + if pd.isna(nc_end) and not pd.isna(nc_end2): + nc_end = nc_end2 + + return out_df, nc_start, nc_end + + raise ValueError(f"Unknown interpolation method: {interpolation_method}") + + +def standardize_time_lat_lon(ds, coord_spec): + mapping = {} + if coord_spec: + for std in ("time", "lat", "lon"): + chosen = coord_spec.get(std) + if chosen and chosen in ds.variables and chosen != std: + mapping[chosen] = std + + if mapping: + ds = ds.rename(mapping) + + for req in ("time", "lat", "lon"): + if req not in ds.variables: + raise ValueError( + f"Missing required '{req}' variable after user selection. " + f"Selected: {coord_spec}. Available: {list(ds.variables.keys())}" + ) + return ds + -def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4): +def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4, + coord_spec=None, temporal_method: str = "linear"): """ Annotate movement points with environmental values using: - - Spatial: nearest grid node - - Temporal: vectorised linear interpolation in time (per grid cell) + - Spatial: nearest grid node + - Temporal: + * "linear" -> vectorised linear interpolation in time, for continuous variables + * "nearest" -> nearest available timestep, for categorical/QC variables This version supports "expanded" variable labels that include a pressure/vertical level, e.g. "v_1000", "v_975", ... 
For such labels, the base variable ("v") is taken from the @@ -413,18 +581,25 @@ def _nearest_indices_vectorized(arr, vals): out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) + temporal_method = (temporal_method or "linear").strip().lower() + if temporal_method not in ("linear", "nearest"): + temporal_method = "linear" + # Placeholders for nearest grid coords (one set; overwritten by last variable) nc_latitudes = np.full(len(out), np.nan, dtype="float64") nc_longitudes = np.full(len(out), np.nan, dtype="float64") - # Target times for np.interp (int64 ns) + # Target times as int64 ns, used for either np.interp or nearest-time lookup. tgt_times = out["timestamp"].to_numpy("datetime64[ns]").astype("int64") # --- main loop over requested labels ------------------------------------- for label in selected_vars: file_path = env_var_map.get(label) - out[label] = np.nan # ensure column exists even on failures - + if temporal_method == "nearest": + # Categorical/QC-safe column: preserve integer codes or labels if present. + out[label] = pd.Series([pd.NA] * len(out), index=out.index, dtype="object") + else: + out[label] = np.nan # continuous numeric column if not file_path or not Path(file_path).is_file(): print(f"[WARNING] File for {label} not found: {file_path}") continue @@ -434,6 +609,7 @@ def _nearest_indices_vectorized(arr, vals): try: ds = safe_open_nc_with_time_decoding(file_path) + ds = standardize_time_lat_lon(ds, coord_spec) if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") ds.close() @@ -491,7 +667,8 @@ def _nearest_indices_vectorized(arr, vals): cell_code = (lat_idx.astype(np.int64) * len(glon)) + lon_idx.astype(np.int64) unique_cells, inverse = np.unique(cell_code, return_inverse=True) - # Cache of per-cell series: (ii, jj) -> 1D float64 array over time + # Cache of per-cell series: (ii, jj) -> 1D array over time. 
+ # For continuous variables this is float64; for categorical/QC variables the original dtype is preserved. series_cache: dict[tuple[int, int], np.ndarray] = {} col_idx = out.columns.get_loc(label) @@ -504,24 +681,77 @@ def _nearest_indices_vectorized(arr, vals): key = (ii, jj) if key not in series_cache: - # Read the cell time series once; cast to float64 for np.interp - series_cache[key] = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") + raw_series = da.isel({lat_dim: ii, lon_dim: jj}).values + + if temporal_method == "nearest": + # Keep original dtype for categorical/QC variables. + # This avoids converting category codes to float and also supports non-numeric labels. + series_cache[key] = raw_series + else: + # Continuous variables: cast to float64 for np.interp. + series_cache[key] = raw_series.astype("float64") + y = series_cache[key] - # Valid-only mask for temporal interpolation - m = np.isfinite(y) - if m.sum() < 2: - out.iloc[pos, col_idx] = np.nan - continue + if temporal_method == "nearest": + # Categorical/QC-safe temporal sampling: + # take the value from the nearest available timestep, no interpolation. 
+ m = pd.notna(y) + if m.sum() < 1: + out.iloc[pos, col_idx] = np.nan + continue + + x = gtime[m] # source times, int64 ns + yy = y[m] # source values, may be integer/category codes + + # Ensure time is sorted + order = np.argsort(x) + x = x[order] + yy = yy[order] - x = gtime[m] # source times (int64) - yy = y[m] # source values + idx = np.searchsorted(x, xi) + right = np.clip(idx, 0, len(x) - 1) + left = np.clip(idx - 1, 0, len(x) - 1) - vals = np.interp(xi, x, yy) - # Outside native time range → NaN (np.interp would extend) - vals[(xi < x.min()) | (xi > x.max())] = np.nan + use_left = ( + (idx > 0) + & ( + (idx == len(x)) + | (np.abs(xi - x[left]) <= np.abs(x[right] - xi)) + ) + ) - out.iloc[pos, col_idx] = vals + nearest_idx = np.where(use_left, left, right) + vals = yy[nearest_idx] + + # Keep existing "no extrapolation" behaviour: + # points outside the native NC time range remain NaN. + vals = vals.astype("object") + vals[(xi < x.min()) | (xi > x.max())] = np.nan + + out.iloc[pos, col_idx] = vals + + else: + # Continuous variables: existing linear temporal interpolation. 
+ y_float = y.astype("float64") + m = np.isfinite(y_float) + if m.sum() < 2: + out.iloc[pos, col_idx] = np.nan + continue + + x = gtime[m] + yy = y_float[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + + vals = np.interp(xi, x, yy) + + # Outside native time range → NaN + vals[(xi < x.min()) | (xi > x.max())] = np.nan + + out.iloc[pos, col_idx] = vals ds.close() @@ -538,11 +768,19 @@ def _nearest_indices_vectorized(arr, vals): return out, pd.NaT, pd.NaT -def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2): +def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2, + coord_spec=None, temporal_method: str = "linear"): """ Annotate movement points with environmental values using: - - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes - - Temporal: 1D linear interpolation in time (per grid node), vectorised via np.interp + - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes + - Temporal: + * "linear" -> 1D linear interpolation in time per grid node + * "nearest" -> nearest available timestep per grid node + + Important: + IDW is suitable for continuous numeric variables. Even with temporal_method="nearest", + spatial IDW still averages values across neighbouring grid nodes, so it is not + recommended for true categorical/QC variables. This version understands expanded variable labels that include a pressure/vertical level, e.g. "v_1000", "v_975". 
It will: @@ -575,6 +813,10 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") out = out.dropna(subset=["timestamp", "location_lat", "location_lon"]) + temporal_method = (temporal_method or "linear").strip().lower() + if temporal_method not in ("linear", "nearest"): + temporal_method = "linear" + # Keep nc_lat/nc_lon semantics consistent with prior implementation (copy of point coords) out["nc_lat"] = out["location_lat"].values out["nc_lon"] = out["location_lon"].values @@ -587,7 +829,12 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: # --- main loop over labels ----------------------------------------------------- for label in selected_vars: file_path = env_var_map.get(label) - out[label] = np.nan # ensure the column exists even if we skip/err + if temporal_method == "nearest": + # Nearest-time mode: preserve raw values before spatial handling. + # Note: spatial IDW is still numeric and is not recommended for true categorical/QC variables. 
+ out[label] = pd.Series([pd.NA] * len(out), index=out.index, dtype="object") + else: + out[label] = np.nan # continuous numeric column if not file_path or not Path(file_path).is_file(): print(f"[WARNING] File for {label} not found: {file_path}") @@ -598,6 +845,7 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: try: ds = safe_open_nc_with_time_decoding(file_path) + ds = standardize_time_lat_lon(ds, coord_spec) if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not in {file_path}") ds.close() @@ -642,7 +890,8 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: gtime_int = pd.to_datetime(ds["time"].values).to_numpy("datetime64[ns]").astype("int64") # Cache per-grid-node time series (to avoid repeated reads for neighbors) - # key: (ii, jj) -> (x_int64_valid, y_float64_valid) + # key: (ii, jj) -> (x_int64_valid, y_valid) + # For linear mode y_valid is float64; for nearest-time mode original dtype is preserved. series_cache: dict[tuple[int, int], tuple[np.ndarray, np.ndarray]] = {} col_idx = out.columns.get_loc(label) @@ -663,26 +912,77 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: for j, (ii, jj) in enumerate(nn_idx): key = (ii, jj) if key not in series_cache: - # Read cell time series once; keep only valid points for interp - y = da.isel({lat_dim: ii, lon_dim: jj}).values.astype("float64") - m = np.isfinite(y) - if m.sum() >= 2: - x = gtime_int[m] - yy = y[m] + raw_y = da.isel({lat_dim: ii, lon_dim: jj}).values + + if temporal_method == "nearest": + # Keep original values for nearest-time lookup. + m = pd.notna(raw_y) + if m.sum() >= 1: + x = gtime_int[m] + yy = raw_y[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + else: + x = np.empty(0, dtype="int64") + yy = np.empty(0, dtype=raw_y.dtype) + else: - x = np.empty(0, dtype="int64") - yy = np.empty(0, dtype="float64") + # Linear interpolation requires numeric float values. 
+ y = raw_y.astype("float64") + m = np.isfinite(y) + if m.sum() >= 2: + x = gtime_int[m] + yy = y[m] + + order = np.argsort(x) + x = x[order] + yy = yy[order] + else: + x = np.empty(0, dtype="int64") + yy = np.empty(0, dtype="float64") + series_cache[key] = (x, yy) x, yy = series_cache[key] - if x.size < 2: - vals[j] = np.nan + if temporal_method == "nearest": + if x.size < 1: + vals[j] = np.nan + else: + idx = np.searchsorted(x, t_i) + + right = np.clip(idx, 0, len(x) - 1) + left = np.clip(idx - 1, 0, len(x) - 1) + + use_left = ( + (idx > 0) + and ( + (idx == len(x)) + or (abs(t_i - x[left]) <= abs(x[right] - t_i)) + ) + ) + + nearest_idx = left if use_left else right + v = yy[nearest_idx] + + # Keep no-extrapolation behaviour. + if (t_i < x.min()) or (t_i > x.max()): + v = np.nan + + vals[j] = v + else: - v = np.interp(t_i, x, yy) - # clamp to NaN if extrapolated - if (t_i < x.min()) or (t_i > x.max()): - v = np.nan - vals[j] = v + if x.size < 2: + vals[j] = np.nan + else: + v = np.interp(t_i, x, yy) + + # Keep no-extrapolation behaviour. + if (t_i < x.min()) or (t_i > x.max()): + v = np.nan + + vals[j] = v # Planar Euclidean distance in degrees (consistent with prior code) dists[j] = np.hypot(glat[ii] - xlat, glon[jj] - xlon) @@ -699,6 +999,34 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] return out, pd.NaT, pd.NaT +def _safe_remove_existing_file(path, retries: int = 5, delay: float = 0.5): + """ + Remove an existing file before overwriting it. + + This is mainly needed on Windows, where NetCDF files can remain locked + for a short time after being opened by xarray/netCDF4/h5netcdf. 
+ """ + path = Path(path) + + if not path.exists(): + return + + last_error = None + + for _ in range(retries): + try: + gc.collect() + path.unlink() + return + except PermissionError as e: + last_error = e + time.sleep(delay) + + raise PermissionError( + f"Could not remove existing file because it is still locked: {path}. " + f"Close any open dataset/viewer using this file and try again. " + f"Original error: {last_error}" + ) def convert_tif_to_nc_before_annotation(tif_paths, output_dir): """ @@ -735,19 +1063,15 @@ def convert_tif_to_nc_before_annotation(tif_paths, output_dir): if nodata is not None: arr = np.where(arr == nodata, np.nan, arr) - # Read scale_factor from tags (if present); otherwise use a 0.0001 heuristic for int16 NDVI/EVI - scale = None - try: - tags = src.tags() - for k in ("scale_factor", "SCALE", "Scale", "scale"): - if k in tags: - scale = float(tags[k]); break - except Exception: - pass - if scale is None and (np.nanmin(arr) >= -10000) and (np.nanmax(arr) <= 10000): - scale = 0.0001 - if scale is not None: - arr = arr * scale + # IMPORTANT: + # Do not apply scale_factor / add_offset during TIF -> NetCDF conversion. + # The NetCDF stores raw raster values. + # + # Optional scale/offset correction is applied later after sampling, + # and only to user-selected continuous variables. + # + # This avoids corrupting categorical/QC layers such as masks, flags, + # land-cover classes, or quality codes. 
planes.append(arr) @@ -773,7 +1097,16 @@ def convert_tif_to_nc_before_annotation(tif_paths, output_dir): base = Path(tif_paths[0]).name.split("_")[0] safe_base = re.sub(r"[^\w\-]", "_", base) out = Path(output_dir) / f"{safe_base}_nc_output.nc" - ds.to_netcdf(out) + _safe_remove_existing_file(out) + + try: + ds.to_netcdf(out) + finally: + try: + ds.close() + except Exception: + pass + return str(out) diff --git a/ecodata/app/apps/__init__.py b/ecodata/app/apps/__init__.py index 6eff0d3..b3cb357 100644 --- a/ecodata/app/apps/__init__.py +++ b/ecodata/app/apps/__init__.py @@ -3,4 +3,8 @@ import ecodata.app.apps.subsetter_app # noqa import ecodata.app.apps.tracks_explorer_app # noqa import ecodata.app.apps.annotation_engine_app # noqa +import ecodata.app.apps.presence_data_preparation_app # noqa +import ecodata.app.apps.nc_builder_app # noqa +#import ecodata.app.apps.height_sampler_app +import ecodata.app.apps.multidimensional_annotation_app # noqa from ecodata.panel_utils import applications # noqa diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index eb1ff29..6dfb476 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -19,10 +19,10 @@ class movebank_annotation_engine(param.Parameterized): local_ID_file = param_widget(FileSelector(constrain_path=False, expanded=True, size=10)) load_data_button = param_widget(pn.widgets.Button(name="Load data", button_type="primary")) taxon_name_val = param_widget( - pn.widgets.MultiSelect(name="Taxon name (press Ctrl for multiple selection)", options=[], height = 140, disabled=True) + pn.widgets.MultiSelect(name="Taxon name (use Ctrl or ⌘ for multiple selection)", options=[], height = 140, disabled=True) ) individual_ID = param_widget( - pn.widgets.MultiSelect(name="Individual ID (press Ctrl for multiple selection)", options=[], height = 140, disabled=True) + pn.widgets.MultiSelect(name="Individual ID (use Ctrl or ⌘ for multiple 
selection)", options=[], height = 140, disabled=True) ) simple_interp_button = param_widget(pn.widgets.Button(name="Simple interpolation (missing ≤ 1 day)", button_type="primary")) deployment_time_gap = param_widget( @@ -68,9 +68,21 @@ class movebank_annotation_engine(param.Parameterized): load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary") load_bound_button = pn.widgets.Button(name="Load boundary data", button_type="primary") reset_bound_button = pn.widgets.Button(name="(!) Reset boundary", button_type="primary") - env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl for multiple)", options=[], height = 140 ) - taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl for multiple)", height = 140) - id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl for multiple)", height = 140) + nc_time_var = pn.widgets.Select(name="Time variable", options=[], value=None) + nc_lat_var = pn.widgets.Select(name="Latitude variable", options=[], value=None) + nc_lon_var = pn.widgets.Select(name="Longitude variable", options=[], value=None) + env_continuous_selector = pn.widgets.MultiSelect( + name="Continuous (use Ctrl or ⌘ for multiple selection)", + options=[], value=[], height=180 + ) + + env_categorical_selector = pn.widgets.MultiSelect( + name="Categorical (use Ctrl or ⌘ for multiple selection)", + options=[], value=[], height=180 + ) + + taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl or ⌘ for multiple)", height = 140) + id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl or ⌘ for multiple)", height = 140) env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", sizing_mode="stretch_width") movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", @@ -118,9 +130,14 @@ class movebank_annotation_engine(param.Parameterized): options=["2", "4", "6", "8"], value="4" ) - tif_env_data_multiselect = pn.widgets.MultiSelect(name="netCDF Environmental variables", options=[], height = 140) - tif_taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon", height = 140) - tif_id_multiselect = pn.widgets.MultiSelect(name="Select ID", height = 140) + tif_env_data_multiselect = pn.widgets.MultiSelect(name="Environmental variables (use Ctrl or ⌘ for multiple)", options=[], height = 140) + # TIF variable type: continuous vs categorical + tif_continuous_vars = pn.widgets.MultiSelect(name="Continuous variables (use Ctrl or ⌘ for multiple)", options=[], value=[], size=8) + tif_categorical_vars = pn.widgets.MultiSelect(name="Categorical/QC variables (use Ctrl or ⌘ for multiple)", options=[], value=[], size=8) + # prevent recursive watcher updates + _syncing_tif_var_types = False + tif_taxon_multiselect = pn.widgets.MultiSelect(name="Select Taxon (use Ctrl or ⌘ for multiple)", height = 140) + tif_id_multiselect = pn.widgets.MultiSelect(name="Select ID (use Ctrl or ⌘ for multiple)", height = 140) tif_env_info = pn.pane.HTML("File: not selected
Environment parameters: -
Time range: -
Spatial range: -
", sizing_mode="stretch_width") tif_movement_info = pn.pane.HTML("File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", @@ -130,6 +147,14 @@ class movebank_annotation_engine(param.Parameterized): "Boundary file: not selected
Spatial range: = environment data boundary", sizing_mode="stretch_width" ) + # --- TIF scaling (optional) --- + tif_apply_scale = pn.widgets.Checkbox(name="Apply scale factor / offset", value=False) + tif_scale_factor = pn.widgets.FloatInput( + name="Scale factor", value=1.0, step=0.0001, start=None, disabled=True + ) + tif_add_offset = pn.widgets.FloatInput( + name="Add offset", value=0.0, step=0.1, start=None, disabled=True + ) tif_interpolation_method = pn.widgets.Select( name="Interpolation method (spatial)", @@ -144,6 +169,8 @@ def __init__(self, **params): self.interpolation_method.name = "Spatial interpolation method (.nc)" self.tif_interpolation_method.name = "Spatial interpolation method (.tif)" + self._wire_env_split_guards() + self._apply_env_selector_labels() rename_param_widgets( self, [ @@ -160,20 +187,23 @@ def __init__(self, **params): "env_data_selector", "bound_data_selector", "movement_data_selector", "load_env_button", "load_bound_button", "reset_bound_button", - "load_movement_button", "env_data_multiselect", + "load_movement_button", "env_continuous_selector", "env_categorical_selector", "taxon_multiselect", "id_multiselect", "boundary_info_str", "interpolation_method", "control_smoothing", "env_info", "movement_info" ,"output_path", "make_annotation_button", + "nc_time_var", "nc_lat_var","nc_lon_var", # === TIF Annotation tab === "tif_env_data_selector", "tif_movement_data_selector", - "tif_bound_data_selector","tif_reset_bound_button", + "tif_bound_data_selector", "tif_reset_bound_button", "tif_env_data_multiselect", + "tif_continuous_vars", "tif_categorical_vars", "tif_taxon_multiselect", "tif_id_multiselect", "tif_interpolation_method", "tif_control_smoothing", + "tif_apply_scale", "tif_scale_factor", "tif_add_offset", "tif_env_info", "tif_movement_info", "tif_make_annotation_button" ] @@ -184,32 +214,36 @@ def __init__(self, **params): NC_H = 1080 # === NC tab === self._nc_col1 = self._section( - "Environmental data (.nc)", + "1. 
Environmental data (.nc)", pn.Column(self.env_data_selector, sizing_mode="stretch_width"), self.load_env_button, - self.env_data_multiselect, + self.env_continuous_selector, + self.env_categorical_selector, self.env_info, + self.nc_time_var, self.nc_lat_var, self.nc_lon_var, self.interpolation_method, self.control_smoothing, self.output_path, - self.make_annotation_button, - height=NC_H, + height=NC_H + 400, ) self._nc_col2 = self._section( - "Movebank data (.csv)", + "2. Movebank data (.csv)", pn.Column(self.movement_data_selector, sizing_mode="stretch_width"), self.load_movement_button, self.taxon_multiselect, + self.id_multiselect, self.movement_info, - height=NC_H, + height=NC_H + 400, ) self._nc_col3 = self._section( - "Boundary data (.shp/.geojson)", + "3. Boundary data (.shp/.geojson)", pn.Column(self.bound_data_selector, sizing_mode="stretch_width"), pn.Row(self.load_bound_button, self.reset_bound_button), - self.id_multiselect, self.boundary_info_str, - height=NC_H, + pn.layout.Divider(), + pn.pane.Markdown("### 4. Start annotation"), + self.make_annotation_button, + height=NC_H + 400, ) # synchronize heights after rendering @@ -220,39 +254,50 @@ def __init__(self, **params): pn.GridBox( self._nc_col1, self._nc_col2, self._nc_col3, ncols=3, sizing_mode="stretch_width", + height=1400, + scroll=True, ), ) # TIF - TIF_H = 1080 + TIF_H = 1500 self._tif_col1 = self._section( - "Environmental data (.tif) - select one (of)", + "1. 
Environmental data (.tif) - select one (of)", pn.Column(self.tif_env_data_selector, sizing_mode="stretch_width"), self.tif_load_env_button, - self.tif_env_data_multiselect, + self.tif_continuous_vars, + self.tif_categorical_vars, + + pn.layout.Divider(), self.tif_env_info, self.tif_interpolation_method, self.tif_control_smoothing, self.tif_output_path, - self.tif_make_annotation_button, + pn.pane.Markdown("### Post-sampling correction for continuous variables"), + self.tif_apply_scale, + self.tif_scale_factor, + self.tif_add_offset, height=TIF_H, ) self._tif_col2 = self._section( - "Movebank data (.csv)", + "2. Movebank data (.csv)", pn.Column(self.tif_movement_data_selector, sizing_mode="stretch_width"), self.tif_load_movement_button, self.tif_taxon_multiselect, + self.tif_id_multiselect, self.tif_movement_info, height=TIF_H, ) self._tif_col3 = self._section( - "Boundary data (.shp/.geojson)", + "3. Boundary data (.shp/.geojson)", pn.Column(self.tif_bound_data_selector, sizing_mode="stretch_width"), pn.Row(self.tif_load_bound_button, self.tif_reset_bound_button), - self.tif_id_multiselect, self.tif_boundary_info_str, + pn.layout.Divider(), + pn.pane.Markdown("### 4. 
Start annotation"), + self.tif_make_annotation_button, height=TIF_H, ) @@ -310,7 +355,8 @@ def __init__(self, **params): self.load_movement_button.on_click(self.load_movement_data) self.taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon, 'value') self.make_annotation_button.on_click(self.run_annotation) - self.env_data_multiselect.param.watch(lambda e: self.update_env_info_text(e.new), "value") + self.env_continuous_selector.param.watch(lambda e: self.update_env_info_text(self._get_selected_env_vars()), "value") + self.env_categorical_selector.param.watch(lambda e: self.update_env_info_text(self._get_selected_env_vars()), "value") self.taxon_multiselect.param.watch(lambda e: self.update_movement_info_text("Taxons", e.new), "value") self.id_multiselect.param.watch(lambda e: self.update_movement_info_text("IDs", e.new), "value") self.interpolation_method.param.watch(self._update_smoothing_options, 'value') @@ -321,10 +367,31 @@ def __init__(self, **params): self.tif_load_movement_button.on_click(self.load_movement_data_tif) self.tif_make_annotation_button.on_click(self.run_annotation_tif) self.tif_taxon_multiselect.param.watch(self.update_annotation_ids_by_taxon_tif, 'value') - self.tif_env_data_multiselect.param.watch(lambda e: self.update_env_info_text_tif(e.new), "value") + self.tif_continuous_vars.param.watch( + lambda e: self.update_env_info_text_tif( + list(self.tif_continuous_vars.value or []) + [ + v for v in list(self.tif_categorical_vars.value or []) + if v not in list(self.tif_continuous_vars.value or []) + ] + ), + "value" + ) + self.tif_categorical_vars.param.watch( + lambda e: self.update_env_info_text_tif( + list(self.tif_continuous_vars.value or []) + [ + v for v in list(self.tif_categorical_vars.value or []) + if v not in list(self.tif_continuous_vars.value or []) + ] + ), + "value" + ) self.tif_taxon_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("Taxons", e.new), "value") 
self.tif_id_multiselect.param.watch(lambda e: self.update_movement_info_text_tif("IDs", e.new), "value") self.tif_interpolation_method.param.watch(self._update_smoothing_options_tif, 'value') + self.tif_apply_scale.param.watch(self._update_tif_scale_widgets, "value") + self._update_tif_scale_widgets() + self.tif_continuous_vars.param.watch(self._sync_tif_variable_type_selection, "value") + self.tif_categorical_vars.param.watch(self._sync_tif_variable_type_selection, "value") @try_catch("Error loading Individual IDs") @@ -476,6 +543,96 @@ def run_merge_files(self, *events): self.alert.object = self.status_text + def _is_categorical_var(self, var_name: str, da) -> bool: + """ + Heuristic classification: + - QC/flag/mask/class/category in name -> categorical + - integer dtype + flag_values/flag_meanings attrs -> categorical + - integer dtype + small number of unique values (sample) -> categorical + """ + name = (var_name or "").lower() + name_hits = ["qc", "quality", "flag", "mask", "class", "category", "type", "landcover", "biome"] + if any(h in name for h in name_hits): + return True + + try: + import numpy as np + if np.issubdtype(da.dtype, np.integer): + attrs = getattr(da, "attrs", {}) or {} + if ("flag_values" in attrs) or ("flag_meanings" in attrs): + return True + + # sample uniqueness (avoid loading whole array) + # take first time slice if possible + sample = da + for dim in da.dims: + if dim.lower() in ("time",): + sample = sample.isel({dim: 0}) + break + vals = sample.values + flat = vals.ravel() + flat = flat[:5000] # cap + flat = flat[~np.isnan(flat)] if flat.dtype.kind == "f" else flat + uniq = np.unique(flat) + if len(uniq) <= 32: + return True + except Exception: + pass + + return False + + def _enforce_env_split_unique(self, changed: str, new_values: list): + """ + Ensure the same variable cannot be selected in both selectors. 
+ changed: "cont" or "cat" + """ + cont = list(self.env_continuous_selector.value or []) + cat = list(self.env_categorical_selector.value or []) + + if changed == "cont": + # remove from categorical... + overlap = set(new_values) & set(cat) + if overlap: + self.env_categorical_selector.value = [v for v in cat if v not in overlap] + + elif changed == "cat": + overlap = set(new_values) & set(cont) + if overlap: + self.env_continuous_selector.value = [v for v in cont if v not in overlap] + + + def _wire_env_split_guards(self): + """ + Attach watchers for mutual exclusivity. + Call once in __init__. + """ + self.env_continuous_selector.param.watch( + lambda e: self._enforce_env_split_unique("cont", list(e.new or [])), + "value" + ) + self.env_categorical_selector.param.watch( + lambda e: self._enforce_env_split_unique("cat", list(e.new or [])), + "value" + ) + + + def _normalize_interp_key(self, ui_value: str) -> str: + """ + Convert UI label -> internal key expected by annotation engine. + Returns 'nearest' or 'idw' (fallback: original string). 
+ """ + s = (ui_value or "").strip().lower() + if s.startswith("nearest"): + return "nearest" + if s.startswith("inverse") or "idw" in s: + return "idw" + return ui_value # fallback + + def _apply_env_selector_labels(self): + """Make selector purposes obvious in UI.""" + self.env_continuous_selector.name = "Continuous (use Ctrl or ⌘ for multiple)" + self.env_categorical_selector.name = "Categorical/QC (use Ctrl or ⌘ for multiple)" + @try_catch("Error loading environmental data") def load_env_data(self, *events): @@ -508,14 +665,70 @@ def load_env_data(self, *events): self._update_info_lines(self.env_info, {"File:": Path(nc_path).name}) self._auto_height(self.env_info) + #################### var_file_map: dict[str, str] = {} time_text = "-" spatial_text = "-" - # Auxiliary coordinate name candidates - time_candidates = ("time","Time","datetime","date","valid_time","forecast_time","verification_time") - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x", "long") + # Coordinate name candidates + time_candidates = ["time", "Time", "datetime", "date", "valid_time"] + lat_candidates = ["lat", "latitude", "Latitude", "y"] + lon_candidates = ["lon", "longitude", "Longitude", "x"] + + try: + ds = safe_open_nc_with_time_decoding(nc_path) + + all_vars = sorted(list(ds.variables.keys())) + + # Populate dropdowns + self.nc_time_var.options = all_vars + self.nc_lat_var.options = all_vars + self.nc_lon_var.options = all_vars + + def pick_first(candidates): + for c in candidates: + if c in all_vars: + return c + return None + + # Preselect defaults (only if user hasn't selected yet) + if not self.nc_time_var.value: + self.nc_time_var.value = pick_first(time_candidates) + if not self.nc_lat_var.value: + self.nc_lat_var.value = pick_first(lat_candidates) + if not self.nc_lon_var.value: + self.nc_lon_var.value = pick_first(lon_candidates) + + # -------- TIME INFO -------- + time_name = self.nc_time_var.value + if time_name and time_name in ds: + 
tvals = pd.to_datetime(ds[time_name].values) + time_text = f"{tvals.min().date()} — {tvals.max().date()}" + + # ------ SPATIAL INFO ------- + lat_name = self.nc_lat_var.value + lon_name = self.nc_lon_var.value + if lat_name in ds and lon_name in ds: + lat_min = float(ds[lat_name].min()) + lat_max = float(ds[lat_name].max()) + lon_min = float(ds[lon_name].min()) + lon_max = float(ds[lon_name].max()) + spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" + + finally: + ds.close() + + + def _pick(cands): + for c in cands: + if c in all_vars: + return c + return None + + # defalts + self.nc_time_var.value = _pick(["time","Time","datetime","date","valid_time","forecast_time","verification_time"]) + self.nc_lat_var.value = _pick(["lat","latitude","y"]) + self.nc_lon_var.value = _pick(["lon","longitude","x","long"]) try: ds = safe_open_nc_with_time_decoding(nc_path) @@ -537,25 +750,23 @@ def load_env_data(self, *events): lon_max = float(ds[lon_name].max()) spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" - # ---- Перелік змінних з підтримкою вертикальних рівнів ---- + # List of variables with support for vertical levels LEVEL_DIM_CANDIDATES = ("isobaricInhPa", "isobaric_in_hPa", "level", "lev", "plev", "pressure", "pressure_level") for var in ds.data_vars: da = ds[var] if da.ndim < 3: - continue # нам потрібні щонайменше time/lat/lon + continue dims = list(da.dims) - # шукаємо назву координати рівня серед типових для ERA5/ECMWF level_dim = next((d for d in LEVEL_DIM_CANDIDATES if d in dims), None) if level_dim is None: - # звичайна 3D-змінна без рівнів — як і раніше var_file_map[var] = nc_path continue - # якщо є рівні — додаємо по опції на кожен рівень: var_1000, var_975, ... + # options for each level: var_1000, var_975, ... 
try: level_vals = ds[level_dim].values except Exception: @@ -563,12 +774,12 @@ def load_env_data(self, *events): for lv in level_vals: try: - # за замовчуванням показуємо цілими hPa (1000, 975, 950 …) + # default - hPa (1000, 975, 950 …) lv_int = int(round(float(lv))) label = f"{var}_{lv_int}" var_file_map[label] = nc_path except Exception: - # якщо рівень нечисловий — пропускаємо конкретне значення + # skip if non-numeric continue finally: @@ -587,14 +798,35 @@ def load_env_data(self, *events): # Variable options if not var_file_map: - self.env_data_multiselect.options = [] + self.env_continuous_selector.options = [] + self.env_categorical_selector.options = [] + self.env_continuous_selector.value = [] + self.env_categorical_selector.value = [] self.status_text = "No 3D variables (e.g. time/lat/lon) found in the file." self.alert.object = self.status_text return + # Store map label -> nc_path self.env_variable_sources = var_file_map - self.env_data_multiselect.options = list(var_file_map.keys()) - self.status_text = f"Loaded {len(var_file_map)} variable(s) from 1 file." + + # labels : continuous vs categorical + all_labels = list(var_file_map.keys()) + # both selectors get ALL variables in options + self.env_continuous_selector.options = all_labels + self.env_categorical_selector.options = all_labels + # reset selections + self.env_continuous_selector.value = [] + self.env_categorical_selector.value = [] + self.status_text = f"Loaded {len(all_labels)} variable(s). Now split them into Continuous vs Categorical/QC." + self.alert.object = self.status_text + self._sync_nc_column_heights() + + + self.status_text = f"Loaded {len(var_file_map)} variable(s)." 
+ self.alert.object = self.status_text + self._sync_nc_column_heights() + #### + self.alert.object = self.status_text self._sync_nc_column_heights() @@ -698,18 +930,38 @@ def load_movement_data(self, *events): self.alert.object = self.status_text self._sync_nc_column_heights() + def _get_selected_env_vars(self): + cont = list(getattr(self.env_continuous_selector, "value", []) or []) + cat = list(getattr(self.env_categorical_selector, "value", []) or []) + seen = set() + out = [] + for v in cont + cat: + if v not in seen: + seen.add(v) + out.append(v) + return out + @try_catch("Error during annotation") def run_annotation(self, *events): self.status_text = "Running annotation..." self.alert.object = self.status_text try: - selected_vars = self.env_data_multiselect.value + continuous_vars = list(getattr(self.env_continuous_selector, "value", []) or []) + categorical_vars = list(getattr(self.env_categorical_selector, "value", []) or []) + # Preserve variable order without duplicates + seen = set() + selected_vars = [] + for v in continuous_vars + categorical_vars: + if v not in seen: + seen.add(v) + selected_vars.append(v) + selected_ids = self.id_multiselect.value env_var_map = getattr(self, "env_variable_sources", {}) movebank_path = self.movement_data_selector.value boundary_path = getattr(self, "boundary_path", None) - interpolation_method = self.interpolation_method.value + interpolation_method = self._normalize_interp_key(self.interpolation_method.value) smoothing_points = int(self.control_smoothing.value) if not selected_vars: @@ -745,10 +997,21 @@ def run_annotation(self, *events): self.status_text = "Annotation started." 
# pass bbox (or None, if the user did choose shp) + coord_spec = { + "time": self.nc_time_var.value, + "lat": self.nc_lat_var.value, + "lon": self.nc_lon_var.value, + } + if not (self.nc_time_var.value and self.nc_lat_var.value and self.nc_lon_var.value): + self.env_info.object = "Please select Time, Latitude and Longitude variables from the NetCDF file." + return + start_annotation_process( env_var_map, selected_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=bbox, smoothing_k=smoothing_points, - out_csv_path=self.output_path.value + out_csv_path=self.output_path.value, coord_spec=coord_spec, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars ) self.status_text = "Annotation finished." @@ -767,7 +1030,7 @@ def load_env_data_tif(self, *events): Workflow: 1) Validate that the user selected any *.tif in the target folder. - 2) Ensure a Movebank CSV is already selected (used to decide output dir). + 2) Use the TIF folder as the output directory for the generated temporary NetCDF. 3) Convert the set of TIFs in that folder → one NetCDF via `convert_tif_to_nc_before_annotation` (each parsed variable = separate DataArray). 4) Open the produced NetCDF with `safe_open_nc_with_time_decoding` and: @@ -783,11 +1046,11 @@ def load_env_data_tif(self, *events): so we avoid re-reading all `data_vars` again. - `self.tif_nc_path` is stored for fallbacks (e.g., bbox from nc if no boundary). """ - # --- 0) Initial UI/status ---------------------------------------------------- + # 0) Initial UI/status self.status_text = "Loading TIF environmental data..." 
self.alert.object = self.status_text - # --- 1) Validate a sample TIF and collect folder ----------------------------- + # 1) Validate a sample TIF and collect folder tif_sample_path = Path(getattr(self.tif_env_data_selector, "value", "") or "") if (not tif_sample_path.is_file()) or (tif_sample_path.suffix.lower() != ".tif"): self.status_text = f"Selected path is not a .tif file: {tif_sample_path}" @@ -801,16 +1064,12 @@ def load_env_data_tif(self, *events): self.alert.object = self.status_text return - # --- 2) Ensure Movebank CSV is loaded (for placing the output NetCDF nearby) - - movebank_path = getattr(self.tif_movement_data_selector, "value", None) - if not movebank_path or not Path(str(movebank_path)).is_file(): - self.status_text = "Please load Movebank data before environmental data." - self.alert.object = self.status_text - return - - output_dir = str(Path(str(movebank_path)).parent) + # 2) Write the temporary NetCDF next to the source TIF files. + # Movebank data is not required at this stage. + # The temporary NetCDF is always saved next to the input TIF files. + output_dir = str(folder_path) - # --- 3) Convert TIF stack → NetCDF ------------------------------------------ + # 3) Convert TIF to NetCDF try: nc_path = convert_tif_to_nc_before_annotation(tif_files, output_dir) except Exception as e: @@ -821,7 +1080,7 @@ def load_env_data_tif(self, *events): # Cache for later (bbox fallback, re-open, etc.) 
self.tif_nc_path = nc_path - # --- 4) Inspect NetCDF and keep ONLY 3D variables with a time dimension ------ + # 4) Inspect NetCDF and keep ONLY 3D variables with a time dimension var_file_map: dict[str, str] = {} time_text = "Time range: -" spatial_text = "Spatial range: -" @@ -873,7 +1132,7 @@ def load_env_data_tif(self, *events): except Exception: pass - # --- 5) Update UI: info panel, multiselect, status --------------------------- + # 5) Update UI: info panel, multiselect, status # Info panel (use common helper to insert/replace rows) self._update_info_lines(self.tif_env_info, { "File:": Path(nc_path).name, @@ -884,24 +1143,47 @@ def load_env_data_tif(self, *events): if not var_file_map: # No valid 3D variables (time/lat/lon) found self.tif_env_var_map = {} + self.tif_env_data_multiselect.options = [] self.tif_env_data_multiselect.value = [] + + self.tif_continuous_vars.options = [] + self.tif_continuous_vars.value = [] + + self.tif_categorical_vars.options = [] + self.tif_categorical_vars.value = [] + self.status_text = "No 3D (time/lat/lon) variables found in the generated NetCDF." self.alert.object = self.status_text return - # Store already filtered variables for later use in run_annotation_tif() + # Save valid TIF variables for annotation. self.tif_env_var_map = var_file_map - - # Options for the multiselect and a default value self.tif_env_data_multiselect.options = var_names - if not self.tif_env_data_multiselect.value: - self.tif_env_data_multiselect.value = var_names[:1] + self.tif_env_data_multiselect.value = [] + + # Populate TIF variable type selectors. + # This is an initial guess only; the user can manually change it. 
+ continuous_guess, categorical_guess = self._guess_tif_variable_types(var_names) + + self.tif_continuous_vars.options = var_names + self.tif_categorical_vars.options = var_names + + self.tif_continuous_vars.value = continuous_guess + self.tif_categorical_vars.value = categorical_guess + + # Update info panel using the actual selected split + selected_for_info = continuous_guess + [ + v for v in categorical_guess + if v not in continuous_guess + ] + self.update_env_info_text_tif(selected_for_info) # Final status self.status_text = ( f"Converted {len(tif_files)} TIF files to NetCDF. " - f"Variables (3D/time): {', '.join(var_names)}" + f"Variables (3D/time): {', '.join(var_names)}. " + "Please check Continuous vs Categorical/QC selection." ) self.alert.object = self.status_text @@ -911,37 +1193,82 @@ def run_annotation_tif(self, *events): """ Run annotation workflow for environmental data sourced from AppEEARS GeoTIFFs. - Steps: - 1) Validate user selections (Movebank CSV, a sample TIF in the target folder, optional boundary). - 2) Gather all *.tif files from the selected folder. - 3) Convert the TIF stack to a single NetCDF via `convert_tif_to_nc_before_annotation` - (this function produces a Dataset with one DataArray per parsed variable). - 4) Read actual variable names from the produced NetCDF and construct `env_var_map` - as {var_name: nc_path}. - 5) Determine which variables to annotate (from the multiselect; default to the first one). - 6) Call `start_annotation_process(...)` with the resolved parameters. - - Notes: - - This function assumes that `convert_tif_to_nc_before_annotation`, `safe_open_nc_with_time_decoding`, - and `start_annotation_process` are already imported. 
- - It also assumes UI widgets exist on the instance: - * self.tif_movement_data_selector (file path to Movebank CSV) - * self.tif_env_data_selector (a sample TIF inside the desired folder) - * self.tif_env_data_multiselect (variable picker) - * self.id_multiselect or self.tif_id_multiselect (optional animal IDs) - * self.tif_bound_data_selector or self.bound_data_selector (optional boundary file) - * self.tif_interpolation_method or self.interpolation_method (method name) - * self.tif_output_path or self.output_path (optional output CSV path) - - Status messages are written to `self.status_text` and mirrored in `self.alert.object`. + Current TIF workflow: + 1) Validate user selections: + - Movebank CSV is required. + - A sample .tif file is required to identify the target TIF folder. + - Boundary file is optional; if it is not provided, the NetCDF extent is used. + + 2) Gather all *.tif files from the selected TIF folder. + + 3) Convert the TIF stack to a temporary NetCDF via + `convert_tif_to_nc_before_annotation(...)`. + + Important: + - The temporary NetCDF is written to the same folder as the input TIF files. + - The conversion keeps raw raster values. + - No scale factor, add_offset, or automatic 0.0001 heuristic is applied during + TIF -> NetCDF conversion. + + 4) Build `env_var_map` for variables that are valid for annotation: + - variables must have a time dimension; + - variables must be at least 3D, typically variable(time, lat, lon). + + 5) Determine variables to annotate from the explicit type selectors: + - `self.tif_continuous_vars` + - `self.tif_categorical_vars` + + The same variable must not be selected in both lists. + + 6) Run annotation through `start_annotation_process(...)`. + + Continuous variables: + - use the selected spatial interpolation method; + - use linear temporal interpolation; + - may optionally receive post-sampling value correction: + corrected_value = sampled_value * scale_factor + add_offset. 
+ + Categorical/QC variables: + - are sampled using nearest spatial grid cell and nearest available timestep; + - are not IDW-averaged; + - are not linearly interpolated in time; + - are not scaled or offset; + - remain raw category/flag/QC codes. + + 7) Save the annotated output CSV and per-individual CSV files through the backend. + + Required UI widgets: + - `self.tif_movement_data_selector`: + Movebank CSV path. + - `self.tif_env_data_selector`: + one sample .tif file inside the target TIF folder. + - `self.tif_continuous_vars`: + continuous environmental variables selected for annotation. + - `self.tif_categorical_vars`: + categorical/QC variables selected for annotation. + - `self.tif_id_multiselect`: + selected individual IDs. + - `self.tif_bound_data_selector`: + optional boundary file. + - `self.tif_interpolation_method`: + spatial interpolation method for continuous variables. + - `self.tif_control_smoothing`: + number of nearest grid points for IDW. + - `self.tif_apply_scale`, `self.tif_scale_factor`, `self.tif_add_offset`: + optional post-sampling correction for continuous variables only. + - `self.tif_output_path`: + output CSV path. + + Status messages are written to `self.status_text` and mirrored in `self.alert.object`. """ self.status_text = "Starting annotation (TIF)…" self.alert.object = self.status_text - # --- 0) Validate inputs --- + # 0) Validate inputs # Movebank CSV (required) movebank_path = getattr(self.tif_movement_data_selector, "value", None) if not movebank_path or not Path(str(movebank_path)).is_file(): - self.status_text = "Please load Movebank data before environmental data." + self.status_text = "Please load Movebank data before running TIF annotation." 
self.alert.object = self.status_text return @@ -959,8 +1286,9 @@ def run_annotation_tif(self, *events): id_widget = getattr(self, "tif_id_multiselect", None)# or getattr(self, "id_multiselect", None) selected_ids = list(getattr(id_widget, "value", [])) if id_widget else [] if not selected_ids: - # Not critical—downstream may annotate all IDs or handle empty list. - print("[WARN] No IDs selected; proceeding without explicit ID filtering.") + self.status_text = "Please select at least one individual ID before running TIF annotation." + self.alert.object = self.status_text + return # Optional boundary bound_widget = getattr(self, "tif_bound_data_selector", None)# or getattr(self, "bound_data_selector", None) @@ -971,13 +1299,14 @@ def run_annotation_tif(self, *events): # Interpolation and time-fit options (prefer TIF-tab widgets; fallback to NC-tab) interp_widget = getattr(self, "tif_interpolation_method", None) - interp_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") - + #??? interp_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") + ui_method = getattr(interp_widget, "value", "Nearest neighbor (time-linear)") + interp_method = self._normalize_interp_key(ui_method) # Output CSV path (optional) out_widget = getattr(self, "tif_output_path", None) output_csv_path = getattr(out_widget, "value", None) - # --- 1) Collect TIFs from the selected folder ----------------------------- + # 1) Collect TIFs from the selected folder folder_path = Path(tif_sample).parent tif_paths = sorted(p for p in folder_path.glob("*.tif") if p.is_file()) if not tif_paths: @@ -985,13 +1314,13 @@ def run_annotation_tif(self, *events): self.alert.object = self.status_text return - # --- 2) Convert TIF → NetCDF (multi-variable) ----------------------------- - output_dir = str(Path(movebank_path).parent) + # 2) Convert TIF → NetCDF (multi-variable, raw values only) + # Scale/offset is not applied here; optional correction is applied after sampling. 
+ output_dir = str(folder_path) nc_path = convert_tif_to_nc_before_annotation([str(p) for p in tif_paths], output_dir) - self.tif_nc_path = nc_path # cache for later use + self.tif_nc_path = nc_path - # --- 3) Read variables from NetCDF and build env_var_map ------------------ - # Prefer already-filtered map from load_env_data_tif (only 3D with 'time') + # 3) Read valid variables from NetCDF and build env_var_map if getattr(self, "tif_env_var_map", None): env_var_map = dict(self.tif_env_var_map) var_names = list(env_var_map.keys()) @@ -1018,17 +1347,41 @@ def run_annotation_tif(self, *events): self.alert.object = self.status_text return - # --- 4) Which variables to annotate? -------------------------------------- - ms_widget = getattr(self, "tif_env_data_multiselect", None) - selected_vars = list(getattr(ms_widget, "value", [])) if ms_widget else [] + # 4) Which variables to annotate? + continuous_vars = list(getattr(self.tif_continuous_vars, "value", []) or []) + categorical_vars = list(getattr(self.tif_categorical_vars, "value", []) or []) + + overlap = set(continuous_vars) & set(categorical_vars) + if overlap: + self.status_text = ( + "The same variable cannot be selected as both Continuous and Categorical/QC: " + + ", ".join(sorted(overlap)) + ) + self.alert.object = self.status_text + return + + selected_vars = continuous_vars + [ + v for v in categorical_vars + if v not in continuous_vars + ] + if not selected_vars: - selected_vars = var_names[:1] # default to the first variable - if ms_widget: - ms_widget.value = selected_vars # sync UI state + self.status_text = "Please select at least one Continuous or Categorical/QC variable." 
+ self.alert.object = self.status_text + return + + # 5) Kick off annotation + scale_msg = ( + f"scale={self.tif_scale_factor.value}, offset={self.tif_add_offset.value}" + if self.tif_apply_scale.value + else "off" + ) - # --- 5) Kick off annotation ------------------------------------------------ self.status_text = ( f"Annotating variables: {', '.join(selected_vars)} | " + f"Continuous: {', '.join(continuous_vars) if continuous_vars else '-'} | " + f"Categorical/QC: {', '.join(categorical_vars) if categorical_vars else '-'} | " + f"Scale/offset: {scale_msg} | " f"IDs: {len(selected_ids) if selected_ids else 'all/unspecified'} | " f"Interpolation: {interp_method}" ) @@ -1060,9 +1413,17 @@ def run_annotation_tif(self, *events): selected_ids=selected_ids, boundary_path=str(boundary_path) if boundary_path else None, interpolation_method=interp_method, - bbox=bbox, - smoothing_k=int(self.tif_control_smoothing.value), - out_csv_path=output_csv_path + bbox=bbox, + smoothing_k=int(self.tif_control_smoothing.value), + out_csv_path=output_csv_path, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars, + # TIF value correction is applied after sampling, + # and only to continuous variables. + apply_value_correction=bool(self.tif_apply_scale.value), + value_scale_factor=float(self.tif_scale_factor.value), + value_add_offset=float(self.tif_add_offset.value), + value_correction_vars=continuous_vars, ) self.status_text = "Annotation finished successfully (TIF)." 
self.alert.object = self.status_text @@ -1176,7 +1537,7 @@ def run_interpolate_missing_only(self, *events): self.alert.object = self.status_text return - # 2) Determine the ID: if the user did not choose, we take all + # 2) Determine the ID: if the user did not choose, take all if self.df is None: try: df_tmp = pd.read_csv(csv_path) @@ -1200,12 +1561,10 @@ def run_interpolate_missing_only(self, *events): start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f") end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S.%f") - # 4) Which columns to interpolate: taken from your validating function + # 4) Which columns to interpolate columns = validate_and_process_csv(csv_path) # 5) Call simplified interpolation - # if you replaced check_missing_values_only -> it now interpolates, - # otherwise import interpolate_missing_values_only and call it here. out_template = self.out_csv_name.value created = interpolate_missing_values_only( start_time_str, end_time_str, csv_path, selected_ids, columns, out_template @@ -1283,6 +1642,71 @@ def update_env_info_text_tif(self, selected_vars): self.tif_env_info.object = "
".join(updated) + def _guess_tif_variable_types(self, variables): + """ + Return initial (continuous, categorical) split for TIF-derived variables. + This is only a first guess. The user can manually change the selection. + """ + categorical_keywords = [ + "qc", + "quality", + "flag", + "mask", + "class", + "category", + "categorical", + "landcover", + "land_cover", + "classification", + "type", + ] + + categorical = [ + v for v in variables + if any(key in str(v).lower() for key in categorical_keywords) + ] + + continuous = [ + v for v in variables + if v not in categorical + ] + + return continuous, categorical + + def _sync_tif_variable_type_selection(self, event=None): + """ + Ensure that the same TIF-derived variable cannot be selected + as both continuous and categorical/QC. + """ + if getattr(self, "_syncing_tif_var_types", False): + return + + self._syncing_tif_var_types = True + try: + continuous = set(self.tif_continuous_vars.value or []) + categorical = set(self.tif_categorical_vars.value or []) + + overlap = continuous & categorical + if not overlap: + return + + # If the user changed Continuous, remove overlap from Categorical/QC. + if event is not None and event.obj is self.tif_continuous_vars: + self.tif_categorical_vars.value = [ + v for v in (self.tif_categorical_vars.value or []) + if v not in overlap + ] + + # If the user changed Categorical/QC, remove overlap from Continuous. 
+ elif event is not None and event.obj is self.tif_categorical_vars: + self.tif_continuous_vars.value = [ + v for v in (self.tif_continuous_vars.value or []) + if v not in overlap + ] + + finally: + self._syncing_tif_var_types = False + def update_movement_info_text_tif(self, section, new_values): current = self.tif_movement_info.object or "" if not current: @@ -1344,8 +1768,8 @@ def _auto_height(self, pane, line_px=22, padding=8): def _update_smoothing_options(self, event): - """Updates options for control_smoothing depending on interpolation method (.nc).""" - if event.new.startswith("Nearest neighbor"): + key = self._normalize_interp_key(event.new) + if key == "nearest": self.control_smoothing.options = ["1"] self.control_smoothing.value = "1" else: @@ -1353,10 +1777,17 @@ def _update_smoothing_options(self, event): if self.control_smoothing.value == "1": self.control_smoothing.value = "4" + def _update_tif_scale_widgets(self, event=None): + """ + Enable scale factor / offset inputs only when post-sampling value correction is enabled. 
+ """ + enabled = bool(self.tif_apply_scale.value) + self.tif_scale_factor.disabled = not enabled + self.tif_add_offset.disabled = not enabled def _update_smoothing_options_tif(self, event): - """Updates options for control_smoothing depending on interpolation method(.tif).""" - if event.new.startswith("Nearest neighbor"): + key = self._normalize_interp_key(event.new) + if key == "nearest": self.tif_control_smoothing.options = ["1"] self.tif_control_smoothing.value = "1" else: diff --git a/ecodata/app/apps/gridded_data_explorer_app.py b/ecodata/app/apps/gridded_data_explorer_app.py index 0acb05d..e81911e 100644 --- a/ecodata/app/apps/gridded_data_explorer_app.py +++ b/ecodata/app/apps/gridded_data_explorer_app.py @@ -163,8 +163,10 @@ class GriddedDataExplorer(param.Parameterized): ) # Progress bar and percent for saving - progress_indicator = param.ClassSelector(pn.indicators.Progress) - progress_percent = param.ClassSelector(pn.widgets.StaticText) + #progress_indicator = param.ClassSelector(pn.indicators.Progress) + progress_indicator = param.ClassSelector(class_=pn.indicators.Progress) + #progress_percent = param.ClassSelector(pn.widgets.StaticText) + progress_percent = param.ClassSelector(class_=pn.widgets.StaticText) # Save statistics stats_fname = param_widget( diff --git a/ecodata/app/apps/multidimensional_annotation_app.py b/ecodata/app/apps/multidimensional_annotation_app.py new file mode 100644 index 0000000..3bf2c4a --- /dev/null +++ b/ecodata/app/apps/multidimensional_annotation_app.py @@ -0,0 +1,1017 @@ +""" +Multidimensional Annotation UI +""" + +from __future__ import annotations + +import logging +import re +from pathlib import Path +from typing import List, Optional, Tuple + +import panel as pn +import pandas as pd + +try: + import xarray as xr +except Exception: # pragma: no cover + xr = None + +try: + import geopandas as gpd +except Exception: # pragma: no cover + gpd = None + +from ecodata.app.config import DEFAULT_TEMPLATE +from 
ecodata.app.models import FileSelector +from ecodata.panel_utils import register_view + +try: + from ecodata.annotation_eng_func import safe_open_nc_with_time_decoding, load_vector_extent_info +except Exception: # pragma: no cover + safe_open_nc_with_time_decoding = None + load_vector_extent_info = None + +try: + from ecodata.multidim_annotation_func import run_multidimensional_annotation_from_paths +except Exception as exc: # pragma: no cover + run_multidimensional_annotation_from_paths = None + BACKEND_IMPORT_ERROR = exc +else: + BACKEND_IMPORT_ERROR = None + +logger = logging.getLogger(__name__) + + +class Multidimensional_Annotation_App: + def __init__(self): + self.name = "Multidimensional Annotation Engine App (DEMO)" + self._movement_columns: List[str] = [] + self._movement_df: Optional[pd.DataFrame] = None + self.boundary_path: Optional[str] = None + + def make_file_selector(name: str, file_pattern: str = "*") -> FileSelector: + return FileSelector( + name=name, + directory=str(Path.home()), + file_pattern=file_pattern, + only_files=True, + constrain_path=False, + expanded=True, + size=10, + sizing_mode="stretch_width", + ) + + self.movement_csv = make_file_selector("Movement CSV", "*.csv") + self.load_movement_button = pn.widgets.Button(name="Load movement data", button_type="primary", sizing_mode="stretch_width") + + self.taxon_multiselect = pn.widgets.MultiSelect( + name="Select Taxon (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.id_multiselect = pn.widgets.MultiSelect( + name="Select ID (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + + self.id_column = pn.widgets.Select(name="ID column", options=[], value=None, sizing_mode="stretch_width") + self.time_column = pn.widgets.Select(name="Timestamp column", options=[], value=None, sizing_mode="stretch_width") + self.lat_column = pn.widgets.Select(name="Latitude column", options=[], value=None, 
sizing_mode="stretch_width") + self.lon_column = pn.widgets.Select(name="Longitude column", options=[], value=None, sizing_mode="stretch_width") + self.height_column = pn.widgets.Select(name="Height / altitude column", options=[], value=None, sizing_mode="stretch_width") + self.height_units = pn.widgets.Select(name="Height units", options=["m"], value="m", sizing_mode="stretch_width") + self.height_reference = pn.widgets.Select( + name="Height reference", + options=[ + "WGS84 ellipsoidal height (Movebank GPS height)", + "Already orthometric / MSL-like", + "Height above ground level (requires DEM)", + ], + value="WGS84 ellipsoidal height (Movebank GPS height)", + sizing_mode="stretch_width", + ) + self.geoid_mode = pn.widgets.Select( + name="Geoid correction", options=["geographiclib", "constant", "none"], value="geographiclib", sizing_mode="stretch_width" + ) + self.constant_geoid_undulation_m = pn.widgets.FloatInput( + name="Constant geoid undulation N, m", value=0.0, step=1.0, sizing_mode="stretch_width" + ) + self.movement_info = pn.pane.HTML( + "File: not selected
Taxons: -
IDs: -
Time range: -
Spatial range: -
", + sizing_mode="stretch_width", + ) + + self.geopotential_file = make_file_selector("Geopotential file", "*.nc") + self.scan_geopotential_button = pn.widgets.Button(name="Scan geopotential file", button_type="primary", sizing_mode="stretch_width") + self.geopotential_variable = pn.widgets.Select(name="Geopotential variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_time_var = pn.widgets.Select(name="Time variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_lat_var = pn.widgets.Select(name="Latitude variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_lon_var = pn.widgets.Select(name="Longitude variable", options=[], value=None, sizing_mode="stretch_width") + self.nc_level_var = pn.widgets.Select(name="Vertical / level variable", options=[], value=None, sizing_mode="stretch_width") + self.geopotential_units = pn.widgets.Select( + name="Geopotential units", options=["m2 s-2", "geopotential metres"], value="m2 s-2", sizing_mode="stretch_width" + ) + self.convert_geopotential_to_height = pn.widgets.Checkbox( + name="Convert geopotential to height using z / 9.80665", value=True, sizing_mode="stretch_width" + ) + self.gravity_constant = pn.widgets.FloatInput(name="Gravity constant", value=9.80665, step=0.00001, disabled=True) + + self.multilevel_var_file = make_file_selector("Annotated var (multilevel) file", "*.nc") + self.scan_multilevel_button = pn.widgets.Button(name="Scan multilevel file", button_type="primary", sizing_mode="stretch_width") + self.multilevel_variable = pn.widgets.Select(name="Annotated var (multilevel, first selected)", options=[], value=None, sizing_mode="stretch_width") + self.multilevel_continuous_vars = pn.widgets.MultiSelect( + name="Continuous multilevel variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=180, sizing_mode="stretch_width" + ) + self.multilevel_categorical_vars = pn.widgets.MultiSelect( + name="Categorical/QC multilevel variables (use Ctrl or ⌘ 
for multiple)", options=[], value=[], height=180, sizing_mode="stretch_width" + ) + + self.surface_var_file = make_file_selector("Annotated var (surface) file", "*.nc") + self.scan_surface_button = pn.widgets.Button(name="Scan surface file", button_type="default", sizing_mode="stretch_width") + self.surface_variable = pn.widgets.Select(name="Annotated var (surface, first selected)", options=[], value=None, sizing_mode="stretch_width") + self.surface_continuous_vars = pn.widgets.MultiSelect( + name="Continuous surface variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.surface_categorical_vars = pn.widgets.MultiSelect( + name="Categorical/QC surface variables (use Ctrl or ⌘ for multiple)", options=[], value=[], height=140, sizing_mode="stretch_width" + ) + self.env_info = pn.pane.HTML( + "File: not selected
Multilevel parameters: -
Surface parameters: -
Time range: -
Spatial range: -
Vertical levels: -
", + sizing_mode="stretch_width", + ) + + self.boundary_file = make_file_selector("Boundary data (.shp/.geojson)", "*") + self.load_boundary_button = pn.widgets.Button(name="Load boundary data", button_type="primary", sizing_mode="stretch_width") + self.reset_boundary_button = pn.widgets.Button(name="(!) Reset boundary", button_type="primary", sizing_mode="stretch_width") + self.boundary_info = pn.pane.HTML( + "Boundary file: not selected
Spatial range: = environmental data boundary", sizing_mode="stretch_width" + ) + + self.spatial_interpolation_method = pn.widgets.Select( + name="Spatial interpolation method", + options=["Nearest neighbor", "Inverse Distance Weighting"], + value="Nearest neighbor", + sizing_mode="stretch_width", + ) + self.control_smoothing = pn.widgets.Select( + name="Number of nearest grid points", options=["1", "2", "4", "6", "8"], value="1", sizing_mode="stretch_width" + ) + self.vertical_matching_method = pn.widgets.Select( + name="Vertical matching method", + options=["Nearest geopotential-height level", "Linear vertical interpolation"], + value="Nearest geopotential-height level", + sizing_mode="stretch_width", + ) + self.use_surface_as_lower_anchor = pn.widgets.Checkbox( + name="Use surface variable as lower vertical anchor", value=True, sizing_mode="stretch_width" + ) + self.surface_anchor_height_agl_m = pn.widgets.FloatInput( + name="Surface anchor height above ground, m", value=2.0, step=0.5, sizing_mode="stretch_width" + ) + + self.u_file = make_file_selector("U wind component file", "*.nc") + self.u_variable = pn.widgets.Select(name="U variable", options=[], value=None, sizing_mode="stretch_width") + self.v_file = make_file_selector("V wind component file", "*.nc") + self.v_variable = pn.widgets.Select(name="V variable", options=[], value=None, sizing_mode="stretch_width") + self.w_file = make_file_selector("W vertical velocity file", "*.nc") + self.w_variable = pn.widgets.Select(name="W variable", options=[], value=None, sizing_mode="stretch_width") + self.temperature_file = make_file_selector("Temperature file", "*.nc") + self.temperature_variable = pn.widgets.Select(name="Temperature variable", options=[], value=None, sizing_mode="stretch_width") + self.scan_optional_components_button = pn.widgets.Button( + name="Scan optional component files", button_type="default", sizing_mode="stretch_width" + ) + + self.topography_source = pn.widgets.Select( + name="Topography 
source", + options=["None", "ETOPO1 Ice Surface Global Relief Model", "SRTM 1 Arc-Second DEM", "ASTER ASTGTM3 Global 30-m DEM", "Custom DEM / GeoTIFF"], + value="None", + sizing_mode="stretch_width", + ) + self.dem_file = make_file_selector("DEM file", "*.tif") + self.dem_units = pn.widgets.Select(name="DEM vertical units", options=["m"], value="m", disabled=True, sizing_mode="stretch_width") + self.dem_reference = pn.widgets.Select( + name="DEM reference", options=["Assumed orthometric / MSL-like"], value="Assumed orthometric / MSL-like", disabled=True, sizing_mode="stretch_width" + ) + + self.derive_wind_support_crosswind = pn.widgets.Checkbox( + name="Wind support and cross wind", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_wind_speed_direction = pn.widgets.Checkbox( + name="Wind speed and direction", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_vertical_motion = pn.widgets.Checkbox( + name="Vertical motion from W", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_thermal_uplift = pn.widgets.Checkbox( + name="Thermal uplift / stability proxy", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.derive_orographic_uplift = pn.widgets.Checkbox( + name="Orographic uplift", value=False, disabled=True, sizing_mode="stretch_width" + ) + self.track_direction_source = pn.widgets.Select( + name="Track direction source", + options=["Compute from consecutive points", "Use existing heading column"], + value="Compute from consecutive points", + disabled=True, + sizing_mode="stretch_width", + ) + self.heading_column = pn.widgets.Select(name="Heading column", options=[], value=None, disabled=True, sizing_mode="stretch_width") + + self.output_csv = pn.widgets.TextInput( + name="Output CSV", value=str(Path.home() / "Downloads" / "multidimensional_annotation_output.csv"), sizing_mode="stretch_width" + ) + self.save_per_individual = pn.widgets.Checkbox(name="Save per individual", 
value=True, sizing_mode="stretch_width") + self.keep_diagnostics = pn.widgets.Checkbox(name="Keep diagnostic columns", value=True, sizing_mode="stretch_width") + self.validate_button = pn.widgets.Button(name="Validate configuration", button_type="primary", sizing_mode="stretch_width") + self.run_button = pn.widgets.Button(name="Run multidimensional annotation", button_type="primary", sizing_mode="stretch_width") + + self.preview = pn.pane.Markdown("### Preview\nNo files scanned yet.", sizing_mode="stretch_width", styles=self._pane_style()) + self.validation = pn.pane.Markdown("### Validation\nNot validated yet.", sizing_mode="stretch_width", styles=self._pane_style()) + self.log = pn.pane.Markdown("### Log\nReady.", sizing_mode="stretch_width", styles=self._pane_style()) + + self.load_movement_button.on_click(self._on_load_movement) + self.scan_geopotential_button.on_click(self._on_scan_geopotential) + self.scan_multilevel_button.on_click(self._on_scan_multilevel) + self.scan_surface_button.on_click(self._on_scan_surface) + self.scan_optional_components_button.on_click(self._on_scan_optional_components) + self.load_boundary_button.on_click(self._on_load_boundary) + self.reset_boundary_button.on_click(self._on_reset_boundary) + self.validate_button.on_click(self._on_validate) + self.run_button.on_click(self._on_run) + self.taxon_multiselect.param.watch(self._update_ids_by_taxon, "value") + self.spatial_interpolation_method.param.watch(self._update_smoothing_options, "value") + + for widget in ( + self.u_file, + self.v_file, + self.w_file, + self.temperature_file, + self.dem_file, + self.topography_source, + self.track_direction_source, + ): + widget.param.watch(self._update_dynamic_states, "value") + + self._wire_variable_split_guards() + self._update_smoothing_options() + self._update_dynamic_states() + + @staticmethod + def _pane_style(): + return {"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"} + + def _append_log(self, message: str) -> 
None: + old = self.log.object or "### Log\n" + if old.strip() == "### Log\nReady.": + old = "### Log\n" + self.log.object = old + f"\n- {message}" + + @staticmethod + def _file_value(value): + if value is None: + return None + if isinstance(value, (list, tuple, set)): + values = list(value) + return values[0] if values else None + return value + + @staticmethod + def _path_exists(value: str) -> bool: + value = Multidimensional_Annotation_App._file_value(value) + if not value: + return False + path = Path(str(value)).expanduser() + return path.exists() and path.is_file() + + @staticmethod + def _optional_path(value): + value = Multidimensional_Annotation_App._file_value(value) + if not value: + return None + path = Path(str(value)).expanduser() + return str(path) if path.exists() and path.is_file() else None + + @staticmethod + def _guess_column(columns: List[str], candidates: List[str]) -> Optional[str]: + lower_map = {c.lower(): c for c in columns} + normalized_map = {re.sub(r"[-:._\s]+", "_", c.lower()): c for c in columns} + for cand in candidates: + c1 = cand.lower() + c2 = re.sub(r"[-:._\s]+", "_", c1) + if c1 in lower_map: + return lower_map[c1] + if c2 in normalized_map: + return normalized_map[c2] + for col in columns: + cl = col.lower() + cn = re.sub(r"[-:._\s]+", "_", cl) + if any(cand.lower() in cl or re.sub(r"[-:._\s]+", "_", cand.lower()) in cn for cand in candidates): + return col + return columns[0] if columns else None + + @staticmethod + def _open_dataset_for_scan(path): + if xr is None: + raise RuntimeError("xarray is not available.") + if safe_open_nc_with_time_decoding is not None: + return safe_open_nc_with_time_decoding(path) + return xr.open_dataset(path, decode_times=False) + + @staticmethod + def _read_nc_metadata(path_value: str) -> Tuple[List[str], List[str], dict]: + path_value = Multidimensional_Annotation_App._file_value(path_value) + if not path_value: + raise FileNotFoundError("No NetCDF file selected.") + path = 
Path(str(path_value)).expanduser() + if not path.exists() or not path.is_file(): + raise FileNotFoundError(f"File not found or not a file: {path}") + if path.suffix.lower() != ".nc": + raise ValueError(f"Expected a .nc file, got: {path}") + + ds = Multidimensional_Annotation_App._open_dataset_for_scan(path) + try: + data_vars = sorted([str(v) for v in ds.data_vars]) + all_names = sorted([str(v) for v in ds.variables]) + meta = Multidimensional_Annotation_App._nc_info(ds) + return data_vars, all_names, meta + finally: + try: + ds.close() + except Exception: + pass + + @staticmethod + def _nc_info(ds) -> dict: + names = set(ds.variables) | set(ds.coords) + time_name = next((c for c in ("time", "valid_time", "forecast_time", "verification_time", "datetime", "date") if c in names), None) + lat_name = next((c for c in ("lat", "latitude", "y") if c in names), None) + lon_name = next((c for c in ("lon", "longitude", "long", "x") if c in names), None) + level_name = next((c for c in ("level", "lev", "plev", "pressure", "pressure_level", "isobaricInhPa", "isobaric_in_hPa") if c in names), None) + + time_text = "-" + spatial_text = "-" + level_text = "-" + + if time_name and time_name in ds: + try: + vals = pd.to_datetime(ds[time_name].values) + time_text = f"{vals.min():%Y-%m-%d %H:%M:%S} — {vals.max():%Y-%m-%d %H:%M:%S}" + except Exception: + pass + + if lat_name and lon_name and lat_name in ds and lon_name in ds: + try: + spatial_text = ( + f"lat[{float(ds[lat_name].min()):.3f}..{float(ds[lat_name].max()):.3f}], " + f"lon[{float(ds[lon_name].min()):.3f}..{float(ds[lon_name].max()):.3f}]" + ) + except Exception: + pass + + if level_name and level_name in ds: + try: + vals = ds[level_name].values + if len(vals) <= 20: + level_text = ", ".join([str(v) for v in vals]) + else: + level_text = f"{len(vals)} levels, {vals[0]} … {vals[-1]}" + except Exception: + pass + + return { + "time_name": time_name, + "lat_name": lat_name, + "lon_name": lon_name, + "level_name": level_name, 
+ "time_text": time_text, + "spatial_text": spatial_text, + "level_text": level_text, + } + + def _set_coord_selectors(self, all_names: List[str], meta: dict) -> None: + for widget in (self.nc_time_var, self.nc_lat_var, self.nc_lon_var, self.nc_level_var): + widget.options = all_names + self.nc_time_var.value = meta.get("time_name") if meta.get("time_name") in all_names else None + self.nc_lat_var.value = meta.get("lat_name") if meta.get("lat_name") in all_names else None + self.nc_lon_var.value = meta.get("lon_name") if meta.get("lon_name") in all_names else None + self.nc_level_var.value = meta.get("level_name") if meta.get("level_name") in all_names else None + + def _update_env_info(self, file_name="-", meta=None) -> None: + multilevel = self._unique_values(self.multilevel_continuous_vars.value, self.multilevel_categorical_vars.value) + surface = self._unique_values(self.surface_continuous_vars.value, self.surface_categorical_vars.value) + meta = meta or {} + self.env_info.object = ( + f"File: {file_name}
" + f"Multilevel parameters: {', '.join(multilevel) if multilevel else '-'}
" + f"Surface parameters: {', '.join(surface) if surface else '-'}
" + f"Time range: {meta.get('time_text', '-')}
" + f"Spatial range: {meta.get('spatial_text', '-')}
" + f"Vertical levels: {meta.get('level_text', '-')}
" + ) + + @staticmethod + def _unique_values(*lists): + out, seen = [], set() + for values in lists: + for v in list(values or []): + if v not in seen: + seen.add(v) + out.append(v) + return out + + def _on_load_movement(self, event=None) -> None: + path_value = self._file_value(self.movement_csv.value) + if not path_value: + self._append_log("No movement CSV selected.") + return + + path = Path(str(path_value)).expanduser() + if not path.exists() or not path.is_file(): + self._append_log(f"Movement CSV does not exist or is not a file: {path}") + return + + try: + df_sample = pd.read_csv(path, nrows=100) + full_df = pd.read_csv(path) + except Exception as exc: + self._append_log(f"Failed to read movement CSV: {exc}") + return + + columns = list(df_sample.columns) + self._movement_columns = columns + self._movement_df = full_df + + for widget in (self.id_column, self.time_column, self.lat_column, self.lon_column, self.height_column, self.heading_column): + widget.options = columns + + self.id_column.value = self._guess_column(columns, ["individual_local_identifier", "individual-local-identifier", "id"]) + self.time_column.value = self._guess_column(columns, ["timestamp", "eobs_start_timestamp", "time", "datetime", "date"]) + self.lat_column.value = self._guess_column(columns, ["location_lat", "location-lat", "lat", "latitude"]) + self.lon_column.value = self._guess_column(columns, ["location_lon", "location_long", "location-long", "lon", "longitude"]) + self.height_column.value = self._guess_column(columns, ["height-above-ellipsoid", "height_above_ellipsoid", "height", "altitude", "elevation", "height_above_msl"]) + self.heading_column.value = self._guess_column(columns, ["heading", "bearing", "direction"]) + + self._populate_taxa_ids_and_info(path, full_df) + self._append_log(f"Movement CSV loaded: {len(columns)} column(s) detected.") + self._update_dynamic_states() + self._refresh_preview() + + def _populate_taxa_ids_and_info(self, path: Path, df: pd.DataFrame) -> 
None: + id_col = self.id_column.value + taxon_col = self._guess_column(list(df.columns), ["individual-taxon-canonical-name", "individual_taxon_canonical_name", "taxon", "species"]) + + ids = sorted(df[id_col].dropna().astype(str).unique()) if id_col and id_col in df.columns else [] + taxa = sorted(df[taxon_col].dropna().astype(str).unique()) if taxon_col and taxon_col in df.columns else [] + + self.id_multiselect.options = ids + self.id_multiselect.value = ids + self.taxon_multiselect.options = taxa + self.taxon_multiselect.value = [] + + time_text = "-" + if self.time_column.value and self.time_column.value in df.columns: + ts = pd.to_datetime(df[self.time_column.value], errors="coerce", dayfirst=True) + if ts.notna().any(): + time_text = f"{ts.min():%Y-%m-%d %H:%M:%S} — {ts.max():%Y-%m-%d %H:%M:%S}" + + spatial_text = "-" + if self.lat_column.value in df.columns and self.lon_column.value in df.columns: + lat = pd.to_numeric(df[self.lat_column.value], errors="coerce") + lon = pd.to_numeric(df[self.lon_column.value], errors="coerce") + if lat.notna().any() and lon.notna().any(): + spatial_text = f"lat[{float(lat.min()):.3f}..{float(lat.max()):.3f}], lon[{float(lon.min()):.3f}..{float(lon.max()):.3f}]" + + self.movement_info.object = ( + f"File: {path.name}
Taxons: {len(taxa)}
IDs: {len(ids)}
" + f"Time range: {time_text}
Spatial range: {spatial_text}
" + ) + + def _update_ids_by_taxon(self, event=None) -> None: + if self._movement_df is None: + return + df = self._movement_df + id_col = self.id_column.value + if not id_col or id_col not in df.columns: + return + taxon_col = self._guess_column(list(df.columns), ["individual-taxon-canonical-name", "individual_taxon_canonical_name", "taxon", "species"]) + selected_taxa = list(self.taxon_multiselect.value or []) + if selected_taxa and taxon_col and taxon_col in df.columns: + ids = sorted(df.loc[df[taxon_col].astype(str).isin(selected_taxa), id_col].dropna().astype(str).unique()) + else: + ids = sorted(df[id_col].dropna().astype(str).unique()) + self.id_multiselect.options = ids + self.id_multiselect.value = ids + self._refresh_preview() + + def _on_scan_geopotential(self, event=None) -> None: + try: + vars_, all_names, meta = self._read_nc_metadata(self.geopotential_file.value) + except Exception as exc: + self._append_log(f"Failed to scan geopotential file: {exc}") + return + self.geopotential_variable.options = vars_ + self.geopotential_variable.value = vars_[0] if vars_ else None + self._set_coord_selectors(all_names, meta) + self._append_log(f"Scanned geopotential file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_multilevel(self, event=None) -> None: + try: + vars_, all_names, meta = self._read_nc_metadata(self.multilevel_var_file.value) + except Exception as exc: + self._append_log(f"Failed to scan multilevel file: {exc}") + return + self.multilevel_variable.options = vars_ + self.multilevel_variable.value = vars_[0] if vars_ else None + self.multilevel_continuous_vars.options = vars_ + self.multilevel_categorical_vars.options = vars_ + self.multilevel_continuous_vars.value = vars_ + self.multilevel_categorical_vars.value = [] + if not self.nc_time_var.options: + self._set_coord_selectors(all_names, meta) + self._update_env_info(Path(str(self._file_value(self.multilevel_var_file.value))).name, meta) + self._append_log(f"Scanned 
multilevel file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_surface(self, event=None) -> None: + try: + vars_, _, meta = self._read_nc_metadata(self.surface_var_file.value) + except Exception as exc: + self._append_log(f"Failed to scan surface file: {exc}") + return + self.surface_variable.options = vars_ + self.surface_variable.value = vars_[0] if vars_ else None + self.surface_continuous_vars.options = vars_ + self.surface_categorical_vars.options = vars_ + self.surface_continuous_vars.value = vars_ + self.surface_categorical_vars.value = [] + self._update_env_info(Path(str(self._file_value(self.surface_var_file.value))).name, meta) + self._append_log(f"Scanned surface file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _scan_nc_to_select(self, path_widget, select_widget, label: str) -> None: + if not self._path_exists(path_widget.value): + return + try: + vars_, _, _ = self._read_nc_metadata(path_widget.value) + except Exception as exc: + self._append_log(f"Failed to scan {label} file: {exc}") + return + select_widget.options = vars_ + select_widget.value = vars_[0] if vars_ else None + self._append_log(f"Scanned {label} file: {len(vars_)} variable(s) found.") + self._refresh_preview() + + def _on_scan_optional_components(self, event=None) -> None: + for path_widget, select_widget, label in [ + (self.u_file, self.u_variable, "U component"), + (self.v_file, self.v_variable, "V component"), + (self.w_file, self.w_variable, "W component"), + (self.temperature_file, self.temperature_variable, "temperature"), + ]: + if self._path_exists(path_widget.value): + self._scan_nc_to_select(path_widget, select_widget, label) + self._update_dynamic_states() + + def _on_load_boundary(self, event=None) -> None: + path = self._optional_path(self.boundary_file.value) + if not path: + self._append_log("No boundary file selected.") + return + try: + if load_vector_extent_info is not None: + loaded_path, south, north, west, east 
= load_vector_extent_info(path) + else: + if gpd is None: + raise RuntimeError("geopandas is not available.") + gdf = gpd.read_file(path) + west, south, east, north = gdf.total_bounds + loaded_path = path + self.boundary_path = str(loaded_path) + self.boundary_info.object = ( + f"Boundary file: {Path(loaded_path).name}
" + f"Spatial range: lat[{south:.3f}..{north:.3f}], lon[{west:.3f}..{east:.3f}]" + ) + self._append_log(f"Boundary loaded: {Path(loaded_path).name}") + except Exception as exc: + self._append_log(f"Failed to load boundary: {exc}") + + def _on_reset_boundary(self, event=None) -> None: + self.boundary_path = None + self.boundary_info.object = "Boundary file: not selected
Spatial range: = environmental data boundary" + self._append_log("Boundary reset to environmental data boundary.") + + def _update_smoothing_options(self, event=None) -> None: + if self.spatial_interpolation_method.value == "Nearest neighbor": + self.control_smoothing.options = ["1"] + self.control_smoothing.value = "1" + else: + self.control_smoothing.options = ["2", "4", "6", "8"] + if self.control_smoothing.value not in self.control_smoothing.options: + self.control_smoothing.value = "4" + + def _update_dynamic_states(self, *_events) -> None: + has_u = self._path_exists(self.u_file.value) + has_v = self._path_exists(self.v_file.value) + has_w = self._path_exists(self.w_file.value) + has_t = self._path_exists(self.temperature_file.value) + has_dem = self.topography_source.value != "None" and self._path_exists(self.dem_file.value) + + self.dem_units.disabled = self.topography_source.value == "None" + self.dem_reference.disabled = self.topography_source.value == "None" + + wind_ready = has_u and has_v + self.derive_wind_speed_direction.disabled = not wind_ready + self.derive_wind_support_crosswind.disabled = not wind_ready + self.track_direction_source.disabled = not wind_ready + self.heading_column.disabled = not (wind_ready and self.track_direction_source.value == "Use existing heading column") + self.derive_vertical_motion.disabled = not has_w + self.derive_thermal_uplift.disabled = not has_t + self.derive_orographic_uplift.disabled = not (wind_ready and has_dem) + + if not wind_ready: + self.derive_wind_speed_direction.value = False + self.derive_wind_support_crosswind.value = False + if not has_w: + self.derive_vertical_motion.value = False + if not has_t: + self.derive_thermal_uplift.value = False + if not (wind_ready and has_dem): + self.derive_orographic_uplift.value = False + + def _enforce_split_unique(self, first, second, changed: str, new_values: list): + a = list(first.value or []) + b = list(second.value or []) + if changed == "first": + overlap = 
set(new_values) & set(b) + if overlap: + second.value = [v for v in b if v not in overlap] + else: + overlap = set(new_values) & set(a) + if overlap: + first.value = [v for v in a if v not in overlap] + self._update_env_info() + self._refresh_preview() + + def _wire_variable_split_guards(self): + self.multilevel_continuous_vars.param.watch( + lambda e: self._enforce_split_unique(self.multilevel_continuous_vars, self.multilevel_categorical_vars, "first", list(e.new or [])), + "value", + ) + self.multilevel_categorical_vars.param.watch( + lambda e: self._enforce_split_unique(self.multilevel_continuous_vars, self.multilevel_categorical_vars, "second", list(e.new or [])), + "value", + ) + self.surface_continuous_vars.param.watch( + lambda e: self._enforce_split_unique(self.surface_continuous_vars, self.surface_categorical_vars, "first", list(e.new or [])), + "value", + ) + self.surface_categorical_vars.param.watch( + lambda e: self._enforce_split_unique(self.surface_continuous_vars, self.surface_categorical_vars, "second", list(e.new or [])), + "value", + ) + + def _selected_multilevel_vars(self) -> List[str]: + return self._unique_values(self.multilevel_continuous_vars.value, self.multilevel_categorical_vars.value) + + def _selected_surface_vars(self) -> List[str]: + return self._unique_values(self.surface_continuous_vars.value, self.surface_categorical_vars.value) + + def _first_or_none(self, values): + values = list(values or []) + return values[0] if values else None + + def _refresh_preview(self) -> None: + lines = [ + "### Preview", + f"- **Movement CSV:** `{self.movement_csv.value or '-'}`", + f"- **Selected IDs:** `{len(self.id_multiselect.value or [])}`", + f"- **Height column:** `{self.height_column.value or '-'}`", + f"- **Geopotential file:** `{self.geopotential_file.value or '-'}`", + f"- **Geopotential variable:** `{self.geopotential_variable.value or '-'}`", + f"- **Multilevel file:** `{self.multilevel_var_file.value or '-'}`", + f"- **Continuous 
multilevel variables:** `{list(self.multilevel_continuous_vars.value or [])}`", + f"- **Categorical multilevel variables:** `{list(self.multilevel_categorical_vars.value or [])}`", + f"- **Surface file:** `{self.surface_var_file.value or '-'}`", + f"- **Continuous surface variables:** `{list(self.surface_continuous_vars.value or [])}`", + f"- **Categorical surface variables:** `{list(self.surface_categorical_vars.value or [])}`", + f"- **Spatial method:** `{self.spatial_interpolation_method.value}`", + f"- **Nearest grid points:** `{self.control_smoothing.value}`", + f"- **Vertical method:** `{self.vertical_matching_method.value}`", + f"- **Boundary file:** `{self.boundary_path or '-'}`", + f"- **Topography source:** `{self.topography_source.value}`", + "", + "**Derived metrics enabled:**", + f"- Wind speed/direction: `{self.derive_wind_speed_direction.value}`", + f"- Wind support/cross wind: `{self.derive_wind_support_crosswind.value}`", + f"- Vertical motion: `{self.derive_vertical_motion.value}`", + f"- Thermal proxy: `{self.derive_thermal_uplift.value}`", + f"- Orographic uplift: `{self.derive_orographic_uplift.value}`", + ] + self.preview.object = "\n".join(lines) + + def _backend_height_reference(self) -> str: + if self.height_reference.value == "WGS84 ellipsoidal height (Movebank GPS height)": + return "ellipsoidal" + if self.height_reference.value == "Height above ground level (requires DEM)": + return "agl" + return "already_orthometric" + + def _backend_heading_source(self) -> str: + return "column" if self.track_direction_source.value == "Use existing heading column" else "compute" + + def _on_validate(self, event=None) -> None: + errors, warnings = [], [] + + if not self._path_exists(self.movement_csv.value): + errors.append("Movement CSV is missing or does not exist.") + for name, widget in [ + ("ID column", self.id_column), + ("Timestamp column", self.time_column), + ("Latitude column", self.lat_column), + ("Longitude column", self.lon_column), + 
("Height column", self.height_column), + ]: + if not widget.value: + errors.append(f"{name} is not selected.") + + if not self._path_exists(self.geopotential_file.value): + errors.append("Geopotential file is required and does not exist.") + if not self.geopotential_variable.value: + errors.append("Geopotential variable is not selected.") + if not self._path_exists(self.multilevel_var_file.value): + errors.append("Annotated multilevel variable file is required and does not exist.") + if not self._selected_multilevel_vars(): + errors.append("No multilevel annotation variables selected.") + + if self.surface_var_file.value and self._path_exists(self.surface_var_file.value) and not self._selected_surface_vars(): + errors.append("Surface variable file is set but no surface variable is selected.") + if self.surface_var_file.value and not self._path_exists(self.surface_var_file.value): + warnings.append("Surface variable file is not selected or is not a file; surface variables will be ignored.") + + if not self.id_multiselect.value: + warnings.append("No individual IDs selected; backend currently processes all rows unless ID filtering is implemented.") + if self.spatial_interpolation_method.value == "Inverse Distance Weighting" and int(self.control_smoothing.value) < 2: + errors.append("IDW requires at least 2 nearest grid points.") + if self.use_surface_as_lower_anchor.value and not self.surface_continuous_vars.value: + warnings.append("Surface anchor is enabled, but no continuous surface variable is selected.") + + if self.topography_source.value != "None" and not self._path_exists(self.dem_file.value): + errors.append("Topography source is selected, but DEM file is missing or does not exist.") + if self.height_reference.value == "Height above ground level (requires DEM)" and not self._path_exists(self.dem_file.value): + errors.append("Height reference is AGL, but DEM file is missing or does not exist.") + if self.height_reference.value == "Already orthometric / 
MSL-like": + warnings.append("Movement height is assumed to be already comparable to ERA5 geopotential height. No geoid correction will be applied.") + if self.height_reference.value == "WGS84 ellipsoidal height (Movebank GPS height)": + warnings.append("Movement height will be converted to MSL/orthometric height using selected geoid correction mode.") + if self.geopotential_units.value == "m2 s-2" and not self.convert_geopotential_to_height.value: + warnings.append("Geopotential units are m2 s-2, but conversion to height is disabled.") + if self.derive_wind_support_crosswind.value and self.track_direction_source.value == "Use existing heading column" and not self.heading_column.value: + errors.append("Heading column is required when using existing heading column.") + + if errors: + lines = ["### Validation", "**Status:** Issues found", "", *[f"- {e}" for e in errors]] + if warnings: + lines += ["", "**Warnings:**", *[f"- {w}" for w in warnings]] + self.validation.object = "\n".join(lines) + self._append_log(f"Validation completed with {len(errors)} error(s).") + else: + lines = ["### Validation", "**Status:** OK", "", "- UI configuration is sufficient for backend run."] + if warnings: + lines += ["", "**Warnings:**", *[f"- {w}" for w in warnings]] + self.validation.object = "\n".join(lines) + self._append_log("Validation completed successfully.") + self._refresh_preview() + + def _on_run(self, event=None) -> None: + self._on_validate() + if "Issues found" in str(self.validation.object): + self._append_log("Run cancelled because validation found errors.") + return + if run_multidimensional_annotation_from_paths is None: + self._append_log(f"Backend import failed: {BACKEND_IMPORT_ERROR}") + return + + self.run_button.disabled = True + self.run_button.name = "Running multidimensional annotation..." 
+ try: + output_csv = Path(str(self.output_csv.value)).expanduser() + output_csv.parent.mkdir(parents=True, exist_ok=True) + + multilevel_cont = list(self.multilevel_continuous_vars.value or []) + multilevel_cat = list(self.multilevel_categorical_vars.value or []) + surface_cont = list(self.surface_continuous_vars.value or []) + surface_cat = list(self.surface_categorical_vars.value or []) + surface_file = ( + self._optional_path(self.surface_var_file.value) + if (surface_cont or surface_cat) + else None + ) + + self._append_log("Starting multidimensional annotation.") + + result = run_multidimensional_annotation_from_paths( + movement_csv=self._optional_path(self.movement_csv.value), + output_csv=str(output_csv), + id_col=self.id_column.value, + selected_ids=list(self.id_multiselect.value) if self.id_multiselect.value else None, + time_col=self.time_column.value, + lat_col=self.lat_column.value, + lon_col=self.lon_column.value, + boundary_path=self.boundary_path or None, + height_col=self.height_column.value, + geopotential_file=self._optional_path(self.geopotential_file.value), + geopotential_variable=self.geopotential_variable.value, + geopotential_units=self.geopotential_units.value, + convert_geopotential_to_height=bool(self.convert_geopotential_to_height.value), + nc_time_var=self.nc_time_var.value or None, + nc_lat_var=self.nc_lat_var.value or None, + nc_lon_var=self.nc_lon_var.value or None, + nc_level_var=self.nc_level_var.value or None, + multilevel_var_file=self._optional_path(self.multilevel_var_file.value), + surface_var_file=surface_file, + multilevel_continuous_vars=multilevel_cont, + multilevel_categorical_vars=multilevel_cat, + surface_continuous_vars=surface_cont, + surface_categorical_vars=surface_cat, + dem_file=self._optional_path(self.dem_file.value), + save_per_individual=bool(self.save_per_individual.value), + keep_diagnostics=bool(self.keep_diagnostics.value), + vertical_matching_method=self.vertical_matching_method.value, + 
height_reference=self._backend_height_reference(), + geoid_mode=self.geoid_mode.value, + constant_geoid_undulation_m=float(self.constant_geoid_undulation_m.value or 0.0), + u_file=self._optional_path(self.u_file.value), + u_variable=self.u_variable.value if self._path_exists(self.u_file.value) else None, + v_file=self._optional_path(self.v_file.value), + v_variable=self.v_variable.value if self._path_exists(self.v_file.value) else None, + w_file=self._optional_path(self.w_file.value), + w_variable=self.w_variable.value if self._path_exists(self.w_file.value) else None, + temperature_file=self._optional_path(self.temperature_file.value), + temperature_variable=self.temperature_variable.value if self._path_exists(self.temperature_file.value) else None, + derive_wind_speed_direction=bool(self.derive_wind_speed_direction.value), + derive_wind_support_crosswind=bool(self.derive_wind_support_crosswind.value), + derive_vertical_motion=bool(self.derive_vertical_motion.value), + derive_thermal_proxy=bool(self.derive_thermal_uplift.value), + smoothing_k=int(self.control_smoothing.value), + derive_orographic_uplift=bool(self.derive_orographic_uplift.value), + heading_col=self.heading_column.value, + heading_source=self._backend_heading_source(), + ) + + n_rows = len(result) if result is not None else 0 + n_cols = len(result.columns) if result is not None else 0 + self._append_log(f"Annotation completed: {n_rows} row(s), {n_cols} column(s).") + self._append_log(f"Output saved to: {output_csv}") + preview_cols = list(result.columns[:20]) if result is not None else [] + self.preview.object = "\n".join([ + "### Preview", + f"- **Output CSV:** `{output_csv}`", + f"- **Rows:** `{n_rows}`", + f"- **Columns:** `{n_cols}`", + "", + "**First output columns:**", + *[f"- `{col}`" for col in preview_cols], + ]) + except Exception as exc: + logger.exception("Multidimensional annotation failed.") + self._append_log(f"Annotation failed: {exc}") + finally: + self.run_button.disabled = False + 
self.run_button.name = "Run multidimensional annotation" + + def _card(self, title: str, *items): + return pn.Card( + pn.Column(*items, sizing_mode="stretch_width"), + title=title, + collapsible=True, + collapsed=False, + sizing_mode="stretch_width", + margin=0, + styles={"margin": "0px", "border-radius": "0px"}, + ) + + def view(self): + COL_H = 3500 + col1 = pn.Column( + self._card( + "1. Movement data", + self.movement_csv, + self.load_movement_button, + self.taxon_multiselect, + self.id_multiselect, + self.id_column, + self.time_column, + self.lat_column, + self.lon_column, + self.height_column, + self.height_units, + self.height_reference, + self.geoid_mode, + self.constant_geoid_undulation_m, + self.movement_info, + ), + self._card( + "2. Vertical reference / geopotential", + self.geopotential_file, + self.scan_geopotential_button, + self.geopotential_variable, + self.nc_time_var, + self.nc_lat_var, + self.nc_lon_var, + self.nc_level_var, + self.geopotential_units, + self.convert_geopotential_to_height, + self.gravity_constant, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + col2 = pn.Column( + self._card( + "3. Annotation variables", + self.multilevel_var_file, + self.scan_multilevel_button, + self.multilevel_continuous_vars, + self.multilevel_categorical_vars, + self.surface_var_file, + self.scan_surface_button, + self.surface_continuous_vars, + self.surface_categorical_vars, + self.env_info, + ), + self._card( + "4. Optional atmospheric components", + self.u_file, + self.u_variable, + self.v_file, + self.v_variable, + self.w_file, + self.w_variable, + self.temperature_file, + self.temperature_variable, + self.scan_optional_components_button, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + col3 = pn.Column( + self._card("5. 
Boundary data", self.boundary_file, pn.Row(self.load_boundary_button, self.reset_boundary_button), self.boundary_info), + self._card( + "6. Interpolation / vertical matching", + self.spatial_interpolation_method, + self.control_smoothing, + self.vertical_matching_method, + self.use_surface_as_lower_anchor, + self.surface_anchor_height_agl_m, + ), + self._card("7. Topography", self.topography_source, self.dem_file, self.dem_units, self.dem_reference), + self._card( + "8. Derived metrics", + self.derive_wind_speed_direction, + self.derive_wind_support_crosswind, + self.track_direction_source, + self.heading_column, + self.derive_vertical_motion, + self.derive_thermal_uplift, + self.derive_orographic_uplift, + ), + self._card( + "9. Output", + self.output_csv, + self.save_per_individual, + self.keep_diagnostics, + self.validate_button, + self.run_button, + ), + sizing_mode="stretch_width", + height=COL_H, + margin=0, + styles={"gap": "0px", "display": "flex", "flex-direction": "column"}, + ) + + return pn.Column( + "# Multidimensional Annotation Engine App (DEMO)", + pn.GridBox(col1, col2, col3, ncols=3, sizing_mode="stretch_width", height=COL_H, scroll=True), + pn.Row(self.preview, self.validation, self.log, sizing_mode="stretch_width"), + sizing_mode="stretch_width", + ) + + +@register_view(ext_args=["floatpanel"]) +def view(): + app = Multidimensional_Annotation_App() + template = DEFAULT_TEMPLATE(main=[app.view()], sidebar=[]) + return template + + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/app/apps/nc_builder_app.py b/ecodata/app/apps/nc_builder_app.py new file mode 100644 index 0000000..398dc81 --- /dev/null +++ b/ecodata/app/apps/nc_builder_app.py @@ -0,0 +1,855 @@ +import logging +from pathlib import Path +from typing import Dict, List, Optional + +import panel as pn +import pandas as pd + +from ecodata.app.config import DEFAULT_TEMPLATE +from ecodata.app.models import 
FileSelector +from ecodata.panel_utils import register_view + +logger = logging.getLogger(__file__) + +BACKEND_IMPORT_ERROR = None + +try: + from ecodata.nc_builder_functions import ( + NCBuildConfig, + build_standardized_netcdf, + scan_netcdf_files, + validate_build_config, + ) +except Exception as exc: + BACKEND_IMPORT_ERROR = str(exc) + NCBuildConfig = None + build_standardized_netcdf = None + scan_netcdf_files = None + validate_build_config = None + + +class NCBuilder_App: + """ + UI for building a standardized CF-style NetCDF file from multiple ERA5 or generic NetCDF files. + """ + + def __init__(self): + self.name = "NetCDF Builder" + self._scanned_files: List[Path] = [] + self._detected_time_min: Optional[pd.Timestamp] = None + self._detected_time_max: Optional[pd.Timestamp] = None + + # 1. Input files + self.input_folder = FileSelector( + name="Input folder", + constrain_path=False, + expanded=True, + size=10, + ) + + self.input_files = pn.widgets.MultiSelect( + name="Select files from current folder", + options={}, + value=[], + size=12, + sizing_mode="stretch_width", + ) + + self.combine_mode = pn.widgets.RadioButtonGroup( + name="Combine mode", + options=["By time", "By level", "By time and level"], + value="By time and level", + button_type="primary", + sizing_mode="stretch_width", + ) + + # 2. 
Variable and coordinate mapping + self.target_variable = pn.widgets.MultiSelect( + name="Target variable(s)", + options=[], + value=[], + size=8, + sizing_mode="stretch_width", + ) + self.time_variable = pn.widgets.Select(name="Time variable", options=[], value=None, sizing_mode="stretch_width") + self.lat_variable = pn.widgets.Select(name="Latitude variable", options=[], value=None, sizing_mode="stretch_width") + self.lon_variable = pn.widgets.Select(name="Longitude variable", options=[], value=None, sizing_mode="stretch_width") + self.level_variable = pn.widgets.Select(name="Vertical / level variable", options=["None"], value="None", sizing_mode="stretch_width") + + self.output_variable_name = pn.widgets.TextInput( + name="Output variable name", + placeholder="Example: temperature", + value="", + sizing_mode="stretch_width", + ) + self.output_level_coord_name = pn.widgets.TextInput( + name="Output level coordinate name", + value="level", + sizing_mode="stretch_width", + ) + self.level_units = pn.widgets.Select( + name="Level units", + options=["hPa", "m", "Pa", "model_level", "custom"], + value="hPa", + sizing_mode="stretch_width", + ) + self.level_units_custom = pn.widgets.TextInput( + name="Custom level units", + placeholder="Example: sigma, hybrid_level, depth_m", + value="", + disabled=True, + sizing_mode="stretch_width", + ) + + self.cf_note = pn.pane.Markdown( + ( + "**Standard output coordinate names:** `time`, `lat`, `lon`, `level` \n" + "The backend writes basic CF-style metadata for coordinate attributes." + ), + sizing_mode="stretch_width", + ) + + # 3. 
Level detection + self.level_source = pn.widgets.Select( + name="Level source", + options=["From NetCDF coordinate", "From filename", "Manual table"], + value="From NetCDF coordinate", + sizing_mode="stretch_width", + ) + self.level_regex = pn.widgets.TextInput( + name="Level regex", + value=r"level(\d+)", + placeholder=r"Example: level(\d+)", + sizing_mode="stretch_width", + ) + self.level_table_path = pn.widgets.TextInput( + name="Level table file", + placeholder="CSV with columns: name, level", + value="", + sizing_mode="stretch_width", + ) + self.level_table_note = pn.pane.Markdown( + ( + "**Manual level table format:** CSV with columns `name` and `level`. \n" + "`name` should match the input file name or a unique part of it." + ), + sizing_mode="stretch_width", + ) + + # 4. Time detection + self.time_source = pn.widgets.Select( + name="Time source", + options=["From NetCDF time coordinate", "From filename", "Manual table"], + value="From NetCDF time coordinate", + sizing_mode="stretch_width", + ) + self.time_regex = pn.widgets.TextInput( + name="Time regex", + value=r"(\d{8})", + placeholder=r"Example: (\d{8}) for YYYYMMDD", + sizing_mode="stretch_width", + ) + self.time_format = pn.widgets.TextInput( + name="Time format", + value="%Y%m%d", + placeholder="Example: %Y%m%d or %Y-%m-%d_%H", + sizing_mode="stretch_width", + ) + self.time_table_path = pn.widgets.TextInput( + name="Time table file", + placeholder="CSV with columns: name, DateTime", + value="", + sizing_mode="stretch_width", + ) + self.time_table_note = pn.pane.Markdown( + ( + "**Manual time table format:** CSV with columns `name` and `DateTime`. \n" + "`name` should match the input file name or a unique part of it. \n" + "`DateTime` should be parseable by pandas, e.g. `1994-01-01 00:00:00`." + ), + sizing_mode="stretch_width", + ) + + # 5. 
Spatial subset + self.use_bbox = pn.widgets.Checkbox(name="Bounding box", value=False, sizing_mode="stretch_width") + self.bbox_south = pn.widgets.FloatInput(name="South", value=None, step=0.25) + self.bbox_north = pn.widgets.FloatInput(name="North", value=None, step=0.25) + self.bbox_west = pn.widgets.FloatInput(name="West", value=None, step=0.25) + self.bbox_east = pn.widgets.FloatInput(name="East", value=None, step=0.25) + self.bbox_note = pn.pane.Markdown( + "If the bounding box is not enabled, the original spatial extent is preserved.", + sizing_mode="stretch_width", + ) + + # 6. Time subset + self.detected_time_range = pn.pane.Markdown("**Detected time range:** not scanned yet", sizing_mode="stretch_width") + self.start_time = pn.widgets.DatetimePicker(name="Start time", value=None, sizing_mode="stretch_width") + self.end_time = pn.widgets.DatetimePicker(name="End time", value=None, sizing_mode="stretch_width") + self.time_subset_note = pn.pane.Markdown( + ( + "If input files do not contain a time coordinate, use **Time source = From filename** " + "or **Manual table**. If no time information is provided, all files will be used." + ), + sizing_mode="stretch_width", + ) + + # 7. 
Output settings + self.output_folder = pn.widgets.TextInput( + name="Output folder", + placeholder="Path to output folder", + value=str(Path.home() / "Downloads"), + sizing_mode="stretch_width", + ) + self.output_filename = pn.widgets.TextInput( + name="Output filename", + value="era5_standardized_temperature.nc", + sizing_mode="stretch_width", + ) + self.output_mode = pn.widgets.Select( + name="Output mode", + options=["Single NetCDF file"], + value="Single NetCDF file", + sizing_mode="stretch_width", + ) + self.use_dask_chunks = pn.widgets.Checkbox(name="Use chunking when reading", value=False, sizing_mode="stretch_width") + self.chunking_mode = pn.widgets.Select(name="Chunking mode", options=["auto", "manual"], value="auto", sizing_mode="stretch_width") + self.chunk_time = pn.widgets.IntInput(name="time chunk", value=24, start=1, step=1, disabled=True) + self.chunk_level = pn.widgets.IntInput(name="level chunk", value=1, start=1, step=1, disabled=True) + self.chunk_lat = pn.widgets.IntInput(name="lat chunk", value=200, start=1, step=10, disabled=True) + self.chunk_lon = pn.widgets.IntInput(name="lon chunk", value=200, start=1, step=10, disabled=True) + self.enable_compression = pn.widgets.Checkbox(name="Enable NetCDF compression", value=True, sizing_mode="stretch_width") + + # Preview / validation / log + self.preview = pn.pane.Markdown( + "### Preview\nNo files scanned yet.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + self.validation_panel = pn.pane.Markdown( + "### Validation\nNot validated yet.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + self.log = pn.pane.Markdown( + "### Log\nReady.", + sizing_mode="stretch_width", + styles={"border": "1px solid #ddd", "padding": "10px", "border-radius": "6px"}, + ) + + # Buttons + self.load_files_button = pn.widgets.Button( + name="Load file list", + button_type="primary", 
+ sizing_mode="stretch_width", + ) + + self.scan_variables_button = pn.widgets.Button( + name="Scan variables", + button_type="primary", + sizing_mode="stretch_width", + ) + self.validate_button = pn.widgets.Button( + name="Validate", + button_type="primary", + sizing_mode="stretch_width", + ) + + self.build_button = pn.widgets.Button( + name="Build standardized NetCDF", + button_type="primary", + sizing_mode="stretch_width", + ) + + self.load_files_button.on_click(self._on_load_file_list) + self.scan_variables_button.on_click(self._on_scan_variables) + self.validate_button.on_click(self._on_validate) + self.build_button.on_click(self._on_build) + self.target_variable.param.watch(self._on_target_variables_changed, "value") + self.level_units.param.watch(self._update_widget_states, "value") + self.level_source.param.watch(self._update_widget_states, "value") + self.time_source.param.watch(self._update_widget_states, "value") + self.use_bbox.param.watch(self._update_widget_states, "value") + self.chunking_mode.param.watch(self._update_widget_states, "value") + self.use_dask_chunks.param.watch(self._update_widget_states, "value") + self.combine_mode.param.watch(self._update_widget_states, "value") + self._update_widget_states() + + def _append_log(self, message: str) -> None: + old = self.log.object or "### Log\n" + if old.strip() == "### Log\nReady.": + old = "### Log\n" + self.log.object = old + f"\n- {message}" + + def _current_input_directory(self) -> Optional[Path]: + """ + Return the input folder represented by the custom FileSelector. + + The custom selector is used only to define the folder. + If the selector value is a file, NCBuilder uses its parent folder. + The actual file list for scan/validate/build is controlled by self.input_files. 
+ """ + candidates = [ + getattr(self.input_folder, "value", None), + getattr(self.input_folder, "directory", None), + ] + + for raw_value in candidates: + if not raw_value: + continue + + if isinstance(raw_value, (list, tuple)): + if not raw_value: + continue + raw_value = raw_value[0] + + path = Path(str(raw_value)).expanduser() + + if path.exists() and path.is_file(): + return path.parent + + if path.exists() and path.is_dir(): + return path + + return None + + + def _list_netcdf_files_in_selected_folder(self) -> List[Path]: + """ + List supported NetCDF-like files in the current input folder. + """ + folder = self._current_input_directory() + if folder is None: + return [] + + extensions = {".nc", ".nc4", ".cdf", ".netcdf"} + + files = [ + p for p in folder.iterdir() + if p.is_file() and p.suffix.lower() in extensions + ] + + return sorted(files, key=lambda p: p.name.lower()) + + + def _refresh_input_file_options(self) -> None: + """ + Load all supported NetCDF files from the current FileSelector directory + into the MultiSelect. + + This method controls what files are visible in the UI. + It does not decide what files will be passed to the backend. + """ + files = self._list_netcdf_files_in_selected_folder() + + options = { + f.name: str(f) + for f in files + if f.exists() and f.is_file() + } + + self.input_files.options = options + + # When a new folder is opened, select all detected files by default. + # The user can then deselect files manually. + self.input_files.value = list(options.values()) + + + def _on_load_file_list(self, event=None) -> None: + """ + Load all supported NetCDF files from the current custom FileSelector folder + into the MultiSelect. + + The custom FileSelector is used only to define the folder. + The actual files passed to scan/validate/build are controlled by + self.input_files.value. 
+ """ + self.log.object = "### Log\n" + + folder = self._current_input_directory() + + if folder is None: + selector_value = getattr(self.input_folder, "value", None) + selector_directory = getattr(self.input_folder, "directory", None) + + self.input_files.options = {} + self.input_files.value = [] + + self.preview.object = ( + "### Preview\n" + "No valid input folder was detected from the custom selector.\n\n" + f"- `FileSelector.value`: `{selector_value}`\n" + f"- `FileSelector.directory`: `{selector_directory}`\n\n" + "Open the target folder or click any file inside that folder, then press **Load file list**." + ) + self._append_log("No valid input folder detected from FileSelector.") + return + + self._refresh_input_file_options() + + n_files = len(self.input_files.options or {}) + + self.preview.object = ( + "### Preview\n" + f"- **Input folder:** `{folder}`\n" + f"- **Files loaded into Select files from current folder:** {n_files}\n" + "- Deselect files that should not be scanned or built." + ) + + if n_files == 0: + self._append_log( + f"No supported NetCDF files found in `{folder}`. " + "Expected extensions: .nc, .nc4, .cdf, .netcdf." + ) + else: + self._append_log(f"Loaded {n_files} NetCDF file(s) from `{folder}`.") + + def _collect_input_files(self) -> List[Path]: + """ + Collect only files explicitly selected in the MultiSelect. + + MultiSelect options may contain all files from the folder, + but only MultiSelect value is passed to scan/validate/build. 
+ """ + selected_values = list(self.input_files.value or []) + + files: List[Path] = [] + + for value in selected_values: + path = Path(str(value)).expanduser() + if path.exists() and path.is_file(): + files.append(path) + + unique_files: List[Path] = [] + seen = set() + + for f in files: + key = str(f.resolve()) if f.exists() else str(f) + if key not in seen: + seen.add(key) + unique_files.append(f) + + return unique_files + + def _sync_selected_files(self) -> List[Path]: + """ + Synchronize backend file list with the current MultiSelect selection. + """ + files = self._collect_input_files() + self._scanned_files = [ + Path(f).expanduser() + for f in files + if Path(f).expanduser().exists() + ] + return self._scanned_files + + def _on_target_variables_changed(self, event=None) -> None: + """ + Update output-name behaviour depending on single-variable or multi-variable mode. + + In multi-variable mode, source variable names are preserved, so the single + output variable name field is disabled. 
+ """ + selected_targets = list(self.target_variable.value or []) + + if len(selected_targets) == 1: + self.output_variable_name.disabled = False + if not self.output_variable_name.value: + self.output_variable_name.value = selected_targets[0] + elif len(selected_targets) > 1: + self.output_variable_name.value = "" + self.output_variable_name.disabled = True + else: + self.output_variable_name.disabled = False + + def _manual_chunks_dict(self) -> Dict[str, int]: + return { + "time": int(self.chunk_time.value), + "level": int(self.chunk_level.value), + "lat": int(self.chunk_lat.value), + "lon": int(self.chunk_lon.value), + } + + def _update_widget_states(self, *_events) -> None: + self.level_units_custom.disabled = self.level_units.value != "custom" + + self.level_variable.disabled = self.level_source.value != "From NetCDF coordinate" + self.level_regex.disabled = self.level_source.value != "From filename" + self.level_table_path.disabled = self.level_source.value != "Manual table" + + self.time_variable.disabled = self.time_source.value != "From NetCDF time coordinate" + self.time_regex.disabled = self.time_source.value != "From filename" + self.time_format.disabled = self.time_source.value != "From filename" + self.time_table_path.disabled = self.time_source.value != "Manual table" + + bbox_disabled = not self.use_bbox.value + for widget in (self.bbox_south, self.bbox_north, self.bbox_west, self.bbox_east): + widget.disabled = bbox_disabled + + manual_chunks = self.use_dask_chunks.value and self.chunking_mode.value == "manual" + self.chunking_mode.disabled = not self.use_dask_chunks.value + for widget in (self.chunk_time, self.chunk_level, self.chunk_lat, self.chunk_lon): + widget.disabled = not manual_chunks + # In "By time" mode, the selected files already define the time range. + # Avoid applying an additional pandas-based time subset, especially for + # cftime calendars such as Julian/noleap/360_day. 
+ time_subset_disabled = self.combine_mode.value == "By time" + + self.start_time.disabled = time_subset_disabled + self.end_time.disabled = time_subset_disabled + + if time_subset_disabled: + self.time_subset_note.object = ( + "In **By time** mode, time subsetting is disabled. " + "Select the required files in **Select files from current folder** instead. " + "The detected time range is shown for information only." + ) + else: + self.time_subset_note.object = ( + "If input files do not contain a time coordinate, use **Time source = From filename** " + "or **Manual table**. If no time information is provided, all files will be used." + ) + def _make_bbox_config(self) -> Optional[Dict[str, float]]: + if not self.use_bbox.value: + return None + return { + "south": float(self.bbox_south.value), + "north": float(self.bbox_north.value), + "west": float(self.bbox_west.value), + "east": float(self.bbox_east.value), + } + + def _make_output_path(self) -> str: + folder = Path(self.output_folder.value or ".").expanduser() + filename = self.output_filename.value or "standardized_output.nc" + return str(folder / filename) + + def _make_build_config(self): + if NCBuildConfig is None: + raise RuntimeError(f"NCBuilder backend functions are not available. 
Import error: {BACKEND_IMPORT_ERROR}") + + manual_chunks = None + if self.use_dask_chunks.value and self.chunking_mode.value == "manual": + manual_chunks = self._manual_chunks_dict() + + level_units = self.level_units_custom.value if self.level_units.value == "custom" else self.level_units.value + + level_variable = self.level_variable.value + if level_variable == "None": + level_variable = None + target_variables = list(self.target_variable.value or []) + target_variable = target_variables[0] if target_variables else None + self._sync_selected_files() + + if self.combine_mode.value == "By time": + start_time = None + end_time = None + else: + start_time = str(self.start_time.value) if self.start_time.value else None + end_time = str(self.end_time.value) if self.end_time.value else None + + return NCBuildConfig( + files=[str(p) for p in self._scanned_files], + combine_mode=self.combine_mode.value, + target_variable=target_variable, + output_variable_name=self.output_variable_name.value or target_variable, + target_variables=target_variables, + lat_variable=self.lat_variable.value, + lon_variable=self.lon_variable.value, + time_source=self.time_source.value, + time_variable=self.time_variable.value, + time_regex=self.time_regex.value, + time_format=self.time_format.value, + time_table_path=self.time_table_path.value or None, + level_source=self.level_source.value, + level_variable=level_variable, + level_regex=self.level_regex.value, + level_table_path=self.level_table_path.value or None, + output_level_coord_name=self.output_level_coord_name.value or "level", + level_units=level_units, + bbox=self._make_bbox_config(), + start_time=start_time, + end_time=end_time, + output_path=self._make_output_path(), + use_dask_chunks=bool(self.use_dask_chunks.value), + chunking_mode=self.chunking_mode.value, + manual_chunks=manual_chunks, + enable_compression=bool(self.enable_compression.value), + convert_longitude_to_180=True, + open_engine="auto", + 
use_modis_time_encoding=True, + ) + + def _on_scan_variables(self, event=None) -> None: + self.log.object = "### Log\n" + + self._sync_selected_files() + + if not self._scanned_files: + self.preview.object = ( + "### Preview\n" + "No NetCDF files are selected. First click **Load file list**, " + "then keep one or more files selected in **Select files from current folder**." + ) + self._append_log("No NetCDF files selected.") + return + + self._append_log(f"Found {len(self._scanned_files)} existing NetCDF file(s).") + + if scan_netcdf_files is None: + self.preview.object = ( + "### Preview\nBackend scan function is not available.\n\n" + f"Import error: `{BACKEND_IMPORT_ERROR}`" + ) + self._append_log("Backend scan function is not available.") + return + + try: + meta = scan_netcdf_files( + self._scanned_files, + max_scan=10, + use_dask_chunks=bool(self.use_dask_chunks.value), + chunking_mode=self.chunking_mode.value, + manual_chunks=self._manual_chunks_dict() if self.chunking_mode.value == "manual" else None, + ) + except Exception as exc: + self.preview.object = f"### Preview\nScan failed: `{exc}`" + self._append_log(f"Scan failed: {exc}") + return + + variables = meta.get("variables", []) + all_names = meta.get("all_names", []) + + self.target_variable.options = variables + self.target_variable.value = [variables[0]] if variables else [] + + self.time_variable.options = all_names + self.lat_variable.options = all_names + self.lon_variable.options = all_names + self.level_variable.options = ["None"] + all_names + + self.time_variable.value = meta.get("suggested_time") + self.lat_variable.value = meta.get("suggested_lat") + self.lon_variable.value = meta.get("suggested_lon") + suggested_level = meta.get("suggested_level") + self.level_variable.value = suggested_level if suggested_level else "None" + + if not self.time_variable.value: + self.time_source.value = "From filename" + self._append_log("No obvious time variable detected. 
Time source was set to 'From filename'.") + + selected_targets = list(self.target_variable.value or []) + if selected_targets: + first_target = selected_targets[0] + + if len(selected_targets) == 1: + self.output_variable_name.value = str(first_target) + if not self.output_filename.value or self.output_filename.value == "era5_standardized_temperature.nc": + self.output_filename.value = f"standardized_{first_target}.nc" + else: + # In multi-variable mode the backend keeps original variable names. + # The output_variable_name field is only meaningful for single-variable mode. + self.output_variable_name.value = "" + if not self.output_filename.value or self.output_filename.value == "era5_standardized_temperature.nc": + self.output_filename.value = "standardized_multivariable.nc" + + self._detected_time_min = pd.to_datetime(meta.get("time_min")) if meta.get("time_min") else None + self._detected_time_max = pd.to_datetime(meta.get("time_max")) if meta.get("time_max") else None + + if self._detected_time_min is not None and self._detected_time_max is not None: + self.start_time.value = self._detected_time_min.to_pydatetime() + self.end_time.value = self._detected_time_max.to_pydatetime() + self.detected_time_range.object = ( + f"**Detected time range:** {self._detected_time_min} → {self._detected_time_max}" + ) + else: + self.detected_time_range.object = "**Detected time range:** not detected from NetCDF coordinates" + + warnings = meta.get("warnings", []) + preview_lines = [ + "### Preview", + f"- **Candidate files:** {len(self._scanned_files)}", + f"- **Scanned files:** {meta.get('scanned_count', 0)}", + f"- **Detected variables:** {', '.join(variables) if variables else '-'}", + f"- **Detected coordinates:** {', '.join(meta.get('coords', [])) if meta.get('coords') else '-'}", + f"- **Detected dimensions:** {', '.join(meta.get('dims', [])) if meta.get('dims') else '-'}", + f"- **Combine mode:** {self.combine_mode.value}", + f"- **Target variable(s):** {', 
'.join(self.target_variable.value) if self.target_variable.value else '-'}", + f"- **Time variable:** {self.time_variable.value or '-'}", + f"- **Latitude variable:** {self.lat_variable.value or '-'}", + f"- **Longitude variable:** {self.lon_variable.value or '-'}", + f"- **Level variable:** {self.level_variable.value or 'None'}", + f"- **Time source:** {self.time_source.value}", + f"- **Level source:** {self.level_source.value}", + ] + if warnings: + preview_lines.append("\n**Warnings:**") + preview_lines.extend([f"- {w}" for w in warnings]) + self.preview.object = "\n".join(preview_lines) + self._append_log("Scan complete.") + + def _on_validate(self, event=None) -> None: + if validate_build_config is None: + self.validation_panel.object = ( + "### Validation\nBackend validation function is not available.\n\n" + f"Import error: `{BACKEND_IMPORT_ERROR}`" + ) + self._append_log("Backend validation function is not available.") + return + + try: + config = self._make_build_config() + ok, errors, warnings = validate_build_config(config) + except Exception as exc: + self.validation_panel.object = f"### Validation\nValidation setup failed: `{exc}`" + self._append_log(f"Validation setup failed: {exc}") + return + + if ok: + lines = [ + "### Validation", + "**Status:** OK", + "", + "- UI settings are sufficient for the backend build step.", + "- Backend will also check grid compatibility during build.", + ] + if warnings: + lines.append("") + lines.append("**Warnings:**") + lines.extend([f"- {w}" for w in warnings]) + self.validation_panel.object = "\n".join(lines) + self._append_log("Validation completed successfully.") + else: + lines = ["### Validation", "**Status:** Issues found", ""] + lines.extend([f"- {e}" for e in errors]) + if warnings: + lines.append("") + lines.append("**Warnings:**") + lines.extend([f"- {w}" for w in warnings]) + self.validation_panel.object = "\n".join(lines) + self._append_log(f"Validation completed with {len(errors)} error(s).") + + def 
_on_build(self, event=None) -> None: + if build_standardized_netcdf is None: + self._append_log(f"Backend build function is not available. Import error: {BACKEND_IMPORT_ERROR}") + return + + try: + config = self._make_build_config() + ok, errors, warnings = validate_build_config(config) + if not ok: + self.validation_panel.object = ( + "### Validation\n**Status:** Issues found\n\n" + + "\n".join(f"- {e}" for e in errors) + ) + self._append_log("Build stopped because validation failed.") + return + + self._append_log("Build started.") + manifest = build_standardized_netcdf(config) + self._append_log(f"Build complete: `{manifest['output_path']}`") + self._append_log(f"Manifest saved: `{manifest['manifest_path']}`") + + self.preview.object = ( + "### Build result\n" + f"- **Output file:** `{manifest['output_path']}`\n" + f"- **Manifest:** `{manifest['manifest_path']}`\n" + f"- **Output dimensions:** `{manifest['output_dims']}`\n" + f"- **Output variables:** {', '.join(manifest['output_variables'])}\n" + f"- **Output coordinates:** {', '.join(manifest['output_coords'])}" + ) + except Exception as exc: + self._append_log(f"Build failed: {exc}") + self.validation_panel.object = f"### Validation / Build error\n`{exc}`" + + def view(self): + input_col = pn.Column( + "## 1. Input files", + self.input_folder, + self.load_files_button, + self.input_files, + self.combine_mode, + self.scan_variables_button, + sizing_mode="stretch_width", + ) + + mapping_col = pn.Column( + "## 2. Variables, coordinates and time", + self.target_variable, + self.time_variable, + self.lat_variable, + self.lon_variable, + self.level_variable, + pn.layout.Divider(), + self.output_variable_name, + self.output_level_coord_name, + self.level_units, + self.level_units_custom, + self.cf_note, + pn.layout.Divider(), + "## 3. Level detection", + self.level_source, + self.level_regex, + self.level_table_path, + self.level_table_note, + pn.layout.Divider(), + "## 4. 
Time detection", + self.time_source, + self.time_regex, + self.time_format, + self.time_table_path, + self.time_table_note, + sizing_mode="stretch_width", + ) + + subset_output_col = pn.Column( + "## 5. Spatial subset", + self.use_bbox, + pn.Row(self.bbox_south, self.bbox_north, sizing_mode="stretch_width"), + pn.Row(self.bbox_west, self.bbox_east, sizing_mode="stretch_width"), + self.bbox_note, + pn.layout.Divider(), + "## 6. Time subset", + self.detected_time_range, + self.start_time, + self.end_time, + self.time_subset_note, + pn.layout.Divider(), + "## 7. Output settings", + self.output_folder, + self.output_filename, + self.output_mode, + self.use_dask_chunks, + self.chunking_mode, + pn.Row(self.chunk_time, self.chunk_level, sizing_mode="stretch_width"), + pn.Row(self.chunk_lat, self.chunk_lon, sizing_mode="stretch_width"), + self.enable_compression, + self.validate_button, + self.build_button, + sizing_mode="stretch_width", + ) + + main = pn.Column( + "# NetCDF Builder", + pn.Row(input_col, mapping_col, subset_output_col, sizing_mode="stretch_width"), + pn.Row(self.preview, self.validation_panel, self.log, sizing_mode="stretch_width"), + sizing_mode="stretch_width", + ) + return main + + +@register_view(ext_args=["floatpanel"]) +def view(): + app = NCBuilder_App() + template = DEFAULT_TEMPLATE( + main=[app.view()], + sidebar=[], + ) + return template + + +if __name__ == "__main__": + pn.serve({Path(__file__).name: view}) + + +if __name__.startswith("bokeh"): + view() diff --git a/ecodata/app/apps/presence_data_preparation_app.py b/ecodata/app/apps/presence_data_preparation_app.py new file mode 100644 index 0000000..ddd3f45 --- /dev/null +++ b/ecodata/app/apps/presence_data_preparation_app.py @@ -0,0 +1,767 @@ +""" +eBird data preparation app for ECODATA-Prepare. 
+ +Provides UI to: +- Select EBD + Sampling Event tables using local file selectors +- Select region polygon using a local file selector, or use a bounding box +- Configure vetting filters +- Aggregate by time and export files usable by ECODATA-Animate +""" + +from __future__ import annotations + +import datetime as dt +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +import panel as pn +import pandas as pd +import numpy as np + +from ecodata.app.config import DEFAULT_TEMPLATE +from ecodata.app.models import FileSelector +from ecodata.panel_utils import register_view +from ecodata.presence_functions import ( + VettingOptions, + AggregationOptions, + aggregate_ebird_to_files, + export_tracks_from_aggregated_counts, + read_species_from_agg_counts, +) + + +def _ensure_dir(path: str) -> str: + """Create directory if missing and return absolute path.""" + path = os.path.abspath(path) + os.makedirs(path, exist_ok=True) + return path + + +def _safe_filename(s: str, default: str = "output") -> str: + """Return filesystem-safe filename.""" + s = (s or "").strip() + s = re.sub(r"[^A-Za-z0-9_.-]+", "_", s) + return s if s else default + + +@dataclass +class OutputPaths: + """Container for output file paths.""" + out_dir: str + agg_counts_csv: str + agg_presence_csv: str + tracks_csv: str + manifest_json: str + + +class EbirdPrepareApp: + """Panel app for preparing eBird data for ECODATA-Animate.""" + + def __init__(self): + self._paths: Optional[OutputPaths] = None + self._region_id: str = "region_1" + + def make_file_selector(name: str, file_pattern: str = "*") -> FileSelector: + return FileSelector( + name=name, + directory=str(Path.home()), + file_pattern=file_pattern, + only_files=True, + constrain_path=False, + expanded=True, + size=10, + sizing_mode="stretch_width", + ) + + self.source_mode = pn.widgets.RadioButtonGroup( + name="Data source", + options=[ + "EBD file", + "Sampling Event file", + 
"Region polygon ", + ], + value="EBD file", + button_type="primary", + ) + + self.ebd_path = make_file_selector( + "EBD local path", + "*", + ) + self.sampling_path = make_file_selector( + "Sampling local path", + "*", + ) + self.polygon_path = make_file_selector( + "Region polygon local path (shapefile or GeoJSON)", + "*", + ) + + self.spatial_filter_mode = pn.widgets.RadioButtonGroup( + name="Spatial filter", + options=["Region polygon", "Bounding box"], + value="Region polygon", + button_type="primary", + ) + + self.bbox_west = pn.widgets.FloatInput( + name="West / min longitude", + value=-88.5, + step=0.1, + ) + self.bbox_south = pn.widgets.FloatInput( + name="South / min latitude", + value=30.1, + step=0.1, + ) + self.bbox_east = pn.widgets.FloatInput( + name="East / max longitude", + value=-84.8, + step=0.1, + ) + self.bbox_north = pn.widgets.FloatInput( + name="North / max latitude", + value=35.1, + step=0.1, + ) + + self.bbox_help = pn.pane.Markdown( + "Use geographic coordinates in EPSG:4326. 
\n" + "- longitude: -180 … 180 \n" + "- latitude: -90 … 90 \n" + "- west < east, south < north", + sizing_mode="stretch_width", + ) + + + self.protocols = pn.widgets.MultiChoice( + name="Allowed protocols (optional)", + options=["Traveling", "Stationary", "Area", "Incidental", "Historical"], + value=["Traveling", "Stationary", "Area"], + ) + self.chk_exclude_incidental = pn.widgets.Checkbox( + name="Exclude incidental/historical", + value=True, + ) + + self.chk_reviewed = pn.widgets.Checkbox( + name="REVIEWED", + value=False, + ) + + self.chk_approved = pn.widgets.Checkbox( + name="APPROVED", + value=False, + ) + + self.chk_all_species_reported = pn.widgets.Checkbox( + name="ALL SPECIES REPORTED", + value=False, + ) + + self.duration_min = pn.widgets.IntInput( + name="Min duration (minutes)", + value=0, + start=0, + ) + self.duration_max = pn.widgets.IntInput( + name="Max duration (minutes)", + value=600, + start=0, + ) + self.distance_min = pn.widgets.FloatInput( + name="Min distance (km)", + value=0.0, + start=0.0, + step=0.1, + ) + self.distance_max = pn.widgets.FloatInput( + name="Max distance (km)", + value=50.0, + start=0.0, + step=0.1, + ) + self.chk_require_valid_coords = pn.widgets.Checkbox( + name="Require valid coordinates", + value=True, + ) + self.max_count_clip = pn.widgets.IntInput( + name="Clip extreme counts above (0=off)", + value=0, + start=0, + ) + + today = dt.date.today() + self.date_start = pn.widgets.DatePicker( + name="Start date", + value=today - dt.timedelta(days=30), + ) + self.date_end = pn.widgets.DatePicker( + name="End date", + value=today, + ) + self.aggregation_days = pn.widgets.IntInput( + name="Aggregation step (days)", + value=7, + start=1, + ) + + self.grid_step_deg = pn.widgets.FloatInput( + name="Grid step (degrees, 0 = use original coordinates)", + value=0.0, + start=0.0, + step=0.1, + ) + + self.min_reporting_rate = pn.widgets.FloatInput( + name="Min frequency of detection (reporting_rate)", + value=0.0, + start=0.0, + 
step=0.01, + ) + self.min_count_per_complete_checklist = pn.widgets.FloatInput( + name="Min effort-standardized count", + value=0.0, + start=0.0, + step=0.1, + ) + self.min_sampling_support = pn.widgets.IntInput( + name="Min sampling support (n_complete_checklists)", + value=0, + start=0, + ) + + self.output_dir = pn.widgets.TextInput( + name="Output folder", + value=str(Path.home() / "Downloads"), + ) + self.run_name = pn.widgets.TextInput( + name="Run name", + value="presence_run", + ) + + # output filename for "tracks" export (used only to name the exported file in UI) + self.output_filename = pn.widgets.TextInput( + name="Output filename", + value="presence_points.csv", + placeholder="presence_points.csv", + ) + + self.btn_aggregate = pn.widgets.Button( + name="Aggregate", + button_type="primary", + ) + self.btn_export_tracks = pn.widgets.Button( + name="Export file for ECODATA-Animate", + button_type="primary", + icon="download", + ) + + self.species_select = pn.widgets.MultiChoice( + name="Species in results", + options=[], + value=[], + ) + + self.status = pn.pane.Alert("Ready.", alert_type="success") + self.log = pn.pane.Markdown("### Log\n", sizing_mode="stretch_both") + self.outputs_view = pn.pane.Markdown("### Outputs\nNo outputs yet.", sizing_mode="stretch_width") + + self.spatial_filter_mode.param.watch(self._on_spatial_mode_changed, "value") + + self.btn_aggregate.on_click(self._on_aggregate_clicked) + self.btn_export_tracks.on_click(self._on_export_tracks_clicked) + + # Rebuild layout (controls column vs results column) + self.sidebar = pn.Spacer(height=0) + self.main = pn.Column(self._build_main(), sizing_mode="stretch_both") + + + def _append_log(self, msg: str) -> None: + """Append log line with timestamp.""" + ts = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + self.log.object += f"\n- `{ts}` {msg}" + + def _set_status(self, msg: str, kind: str = "info") -> None: + """Set status alert message.""" + self.status.object = msg + 
self.status.alert_type = kind + + def _compute_paths(self) -> OutputPaths: + """Compute output paths from output_dir and run_name.""" + out_dir = _ensure_dir(self.output_dir.value) + run = _safe_filename(self.run_name.value, default="presence_run") + return OutputPaths( + out_dir=out_dir, + agg_counts_csv=os.path.join(out_dir, f"{run}__agg_counts.csv"), + agg_presence_csv=os.path.join(out_dir, f"{run}__agg_presence.csv"), + tracks_csv=os.path.join(out_dir, f"{run}__presence_points.csv"), + manifest_json=os.path.join(out_dir, f"{run}__manifest.json"), + ) + + def _format_coord_token(self, value: float) -> str: + """ + Format coordinate for safe use in region_id / filenames. + Example: -88.4667 -> m88p4667 + """ + s = f"{float(value):.4f}" + s = s.replace("-", "m").replace(".", "p") + return s + + + def _build_region_id( + self, + *, + bbox: tuple[float, float, float, float] | None, + polygon_filename_hint: str = "", + ) -> str: + """ + Build region_id from bbox coordinates or polygon filename. 
+ """ + if bbox is not None: + west, south, east, north = bbox + return ( + "bbox_" + f"{self._format_coord_token(west)}_" + f"{self._format_coord_token(south)}_" + f"{self._format_coord_token(east)}_" + f"{self._format_coord_token(north)}" + ) + + name = os.path.basename(polygon_filename_hint or "").strip() + if name: + stem = os.path.splitext(name)[0] + safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in stem) + safe = safe.strip("_") + if safe: + return f"poly_{safe}" + + return "poly_region" + + def _resolve_table_source(self, kind: str) -> tuple[str, str]: + """Resolve EBD or Sampling input as a local filesystem path.""" + kind = str(kind).strip().lower() + if kind not in {"ebd", "sampling"}: + raise ValueError(f"Unknown table kind: {kind}") + + selector = self.ebd_path if kind == "ebd" else self.sampling_path + label = "EBD" if kind == "ebd" else "Sampling" + + path = str(selector.value or "").strip() + if not path: + raise ValueError(f"Select {label} file with the file selector.") + if not os.path.exists(path): + raise ValueError(f"{label} path does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"{label} path is not a file: {path}") + + return path, os.path.basename(path) + + def _resolve_polygon_source(self) -> tuple[str, str]: + """Resolve polygon input as a local filesystem path.""" + path = str(self.polygon_path.value or "").strip() + if not path: + raise ValueError("Select a polygon file with the file selector.") + if not os.path.exists(path): + raise ValueError(f"Polygon path does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"Polygon path is not a file: {path}") + return path, os.path.basename(path) + + def _on_spatial_mode_changed(self, event) -> None: + """Refresh UI when spatial filter mode changes.""" + self.main[:] = [self._build_main()] + + def _build_sidebar(self) -> pn.Column: + """Build controls (left column).""" + + if self.spatial_filter_mode.value == "Region polygon": + 
spatial_controls = pn.Column( + self.spatial_filter_mode, + self.polygon_path, + sizing_mode="stretch_width", + ) + else: + spatial_controls = pn.Column( + self.spatial_filter_mode, + pn.Row( + self.bbox_west, + self.bbox_south, + self.bbox_east, + self.bbox_north, + sizing_mode="stretch_width", + ), + self.bbox_help, + sizing_mode="stretch_width", + ) + + io_box = pn.Column( + pn.pane.Markdown("#### 1. Inputs"), + pn.Row( + pn.Column( + pn.pane.Markdown("**EBD file**"), + self.ebd_path, + sizing_mode="stretch_width", + ), + + pn.Column( + pn.pane.Markdown("**Sampling Event file**"), + self.sampling_path, + sizing_mode="stretch_width", + ), + + sizing_mode="stretch_width", + ), + + pn.layout.Divider(), + + pn.pane.Markdown("#### 2. Spatial subset"), + spatial_controls, + + pn.layout.Divider(), + + pn.pane.Markdown("#### 3. Outputs"), + pn.Row( + self.output_dir, + self.run_name, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + vet_box = pn.Column( + pn.pane.Markdown("#### 4. Vetting / filtering"), + + # 1) all checkboxes in one row + pn.Row( + self.chk_reviewed, + self.chk_approved, + self.chk_all_species_reported, + self.chk_exclude_incidental, + self.chk_require_valid_coords, + sizing_mode="stretch_width", + ), + + # protocols + max count clip + pn.Row( + self.protocols, + self.max_count_clip, + sizing_mode="stretch_width", + ), + + # Min / Max duration in one row + pn.Row( + self.duration_min, + self.duration_max, + sizing_mode="stretch_width", + ), + + # Min / Max distance in next row + pn.Row( + self.distance_min, + self.distance_max, + sizing_mode="stretch_width", + ), + + sizing_mode="stretch_width", + ) + + + time_box = pn.Column( + pn.pane.Markdown("#### 5. Time and spatial aggregation"), + pn.Row( + self.date_start, + self.date_end, + self.aggregation_days, + self.grid_step_deg, + sizing_mode="stretch_width", + ), + pn.layout.Divider(), + pn.pane.Markdown("#### 6. 
Derived-metric filters"), + pn.Row( + self.min_reporting_rate, + self.min_count_per_complete_checklist, + self.min_sampling_support, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + actions = pn.Column( + pn.pane.Markdown("#### 7. Actions"), + pn.Row(self.btn_aggregate, sizing_mode="stretch_width"), + pn.layout.Divider(), + # 6) Species before export + add output filename in same row + pn.Row( + self.species_select, + self.output_filename, + self.btn_export_tracks, + sizing_mode="stretch_width", + ), + sizing_mode="stretch_width", + ) + + return pn.Column(io_box, vet_box, time_box, actions, sizing_mode="stretch_width") + + + def _build_main(self) -> pn.Row: + """Build 2-column layout: controls (wider) + outputs/log (narrower).""" + + controls = pn.Column( + pn.pane.Markdown("## Animal presence data preparation (eBird-compatible format)"), + self._build_sidebar(), + sizing_mode="stretch_both", + styles={"flex": "2"}, # 1) first column wider + ) + + results = pn.Column( + self.status, + self.outputs_view, + pn.layout.Divider(), + self.log, + sizing_mode="stretch_both", + styles={"flex": "1"}, # second column narrower + ) + + return pn.Row(controls, results, sizing_mode="stretch_both") + + def _apply_metric_filters_to_counts(self, counts_csv: str) -> List[str]: + """ + Apply derived-metric filters to aggregated counts CSV in place. 
+ + Returns: + - updated species list after filtering + """ + if not counts_csv or not os.path.exists(counts_csv): + return [] + + df = pd.read_csv(counts_csv) + + if "reporting_rate" in df.columns: + df = df[df["reporting_rate"].fillna(-np.inf) >= float(self.min_reporting_rate.value or 0.0)] + + if "count_per_complete_checklist" in df.columns: + df = df[ + df["count_per_complete_checklist"].fillna(-np.inf) + >= float(self.min_count_per_complete_checklist.value or 0.0) + ] + + if "n_complete_checklists" in df.columns: + df = df[df["n_complete_checklists"].fillna(0) >= int(self.min_sampling_support.value or 0)] + + df.to_csv(counts_csv, index=False, encoding="utf-8") + return sorted(df["species"].dropna().astype(str).unique().tolist()) + + def _on_aggregate_clicked(self, _event) -> None: + """Run backend aggregation and update species list.""" + + try: + ebd_source, ebd_name = self._resolve_table_source("ebd") + sampling_source, sampling_name = self._resolve_table_source("sampling") + except Exception as e: + self._set_status(str(e), "danger") + self._append_log(f"Aggregation blocked: {e}") + return + + polygon_source = None + polygon_filename_hint = "" + bbox = None + + if self.spatial_filter_mode.value == "Region polygon": + try: + polygon_source, polygon_filename_hint = self._resolve_polygon_source() + except Exception as e: + self._set_status(str(e), "danger") + self._append_log(f"Aggregation blocked: {e}") + return + else: + bbox_values = [ + self.bbox_west.value, + self.bbox_south.value, + self.bbox_east.value, + self.bbox_north.value, + ] + if any(v is None for v in bbox_values): + self._set_status("Fill all four bbox coordinates.", "danger") + self._append_log("Aggregation blocked: incomplete bbox.") + return + bbox = tuple(float(v) for v in bbox_values) + region_id = self._build_region_id( + bbox=bbox, + polygon_filename_hint=polygon_filename_hint, + ) + start = self.date_start.value + end = self.date_end.value + if not start or not end or end < start: + 
self._set_status("Check start/end dates.", "danger") + self._append_log("Aggregation blocked: invalid dates.") + return + step_days = int(self.aggregation_days.value or 0) + if step_days < 1: + self._set_status("Aggregation step (days) must be >= 1.", "danger") + self._append_log("Aggregation blocked: invalid aggregation_days.") + return + grid_step_deg = float(self.grid_step_deg.value or 0.0) + if grid_step_deg < 0: + self._set_status("Grid step (degrees) must be >= 0.", "danger") + self._append_log("Aggregation blocked: invalid grid_step_deg.") + return + + self._paths = self._compute_paths() + + vet = VettingOptions( + require_reviewed=bool(self.chk_reviewed.value), + require_approved=bool(self.chk_approved.value), + require_all_species_reported=bool(self.chk_all_species_reported.value), + allowed_protocols=list(self.protocols.value) if self.protocols.value else None, + exclude_incidental_historical=bool(self.chk_exclude_incidental.value), + duration_min_minutes=int(self.duration_min.value or 0), + duration_max_minutes=int(self.duration_max.value or 600), + distance_min_km=float(self.distance_min.value or 0.0), + distance_max_km=float(self.distance_max.value or 50.0), + require_valid_coords=bool(self.chk_require_valid_coords.value), + clip_counts_above=int(self.max_count_clip.value or 0), + ) + + agg = AggregationOptions( + start_date=start, + end_date=end, + step_days=step_days, + grid_step_deg=grid_step_deg, + treat_x_as_one=True, + ) + + try: + self._set_status("Aggregating…", "warning") + self._append_log("Aggregation started.") + self._append_log(f"EBD source: local path -> {ebd_source}") + self._append_log(f"Sampling source: local path -> {sampling_source}") + + if bbox is not None: + self._append_log( + f"Using bbox: west={bbox[0]}, south={bbox[1]}, east={bbox[2]}, north={bbox[3]}." + ) + else: + self._append_log( + f"Using polygon: {polygon_filename_hint or '[unknown name]'}." 
def _on_export_tracks_clicked(self, _event) -> None:
    """Export the presence-points (pseudo-track) CSV from the aggregated counts file.

    Output paths are computed on demand when none are stored yet. The
    region ID is rebuilt from the current spatial widgets: the polygon file
    name in "Region polygon" mode, otherwise the bbox when all four corners
    are filled in. Failures inside the export step are reported through the
    status alert and the log instead of being raised.

    Args:
        _event: Button click event (unused).
    """
    if not self._paths:
        self._paths = self._compute_paths()

    bbox = None
    polygon_filename_hint = ""
    if self.spatial_filter_mode.value != "Region polygon":
        corners = [
            self.bbox_west.value,
            self.bbox_south.value,
            self.bbox_east.value,
            self.bbox_north.value,
        ]
        # An incomplete bbox is ignored here rather than reported.
        if all(corner is not None for corner in corners):
            bbox = tuple(float(corner) for corner in corners)
    else:
        polygon_path = str(self.polygon_path.value or "").strip()
        if polygon_path:
            polygon_filename_hint = os.path.basename(polygon_path)

    region_id = self._build_region_id(
        bbox=bbox,
        polygon_filename_hint=polygon_filename_hint,
    )
    self._append_log(f"Export region ID: {region_id}")

    try:
        chosen = self.species_select.value
        export_tracks_from_aggregated_counts(
            agg_counts_csv=self._paths.agg_counts_csv,
            tracks_csv=self._paths.tracks_csv,
            region_id=region_id,
            id_mode="species",
            species_filter=list(chosen) if chosen else None,
        )

        # Refresh the selectable species from the counts file on disk.
        self.species_select.options = read_species_from_agg_counts(self._paths.agg_counts_csv)

        self._set_status("Export complete.", "success")
        self._append_log(f"Created: {self._paths.tracks_csv}")

        previous = self.outputs_view.object or "### Outputs\n"
        self.outputs_view.object = (
            previous
            + f"\n- **presence_points.csv (for Animate)**: `{self._paths.tracks_csv}`\n"
        )
    except Exception as e:
        self._set_status(f"Export failed: {e}", "danger")
        self._append_log(f"Export failed: {e}")
(old + new) --- fmts = [ - # legacy ISO-like (kept first for backward compatibility) + # ISO-like (kept first for compatibility) "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", @@ -556,7 +556,7 @@ def split_into_sessions(data, max_gap_minutes): "eobs_start_timestamp", "eobs_temperature", "ground_speed", "height_above_ellipsoid" ] - if result_paths: # перевірка, що є створені файли + if result_paths: last_file = result_paths[-1] try: df_check = normalize_column_names(pd.read_csv(last_file, low_memory=False)) @@ -917,7 +917,7 @@ def _norm(s: str) -> str: lon_key = next((norm_map[_norm(c)] for c in lon_syn if _norm(c) in norm_map), None) lat_key = next((norm_map[_norm(c)] for c in lat_syn if _norm(c) in norm_map), None) - # soft fallback to legacy dash-style names if present + # fallback : dash-style names if present if lon_key is None and "location-long" in fieldnames: lon_key = "location-long" if lat_key is None and "location-lat" in fieldnames: diff --git a/ecodata/multidim_annotation_func.py b/ecodata/multidim_annotation_func.py new file mode 100644 index 0000000..a07d591 --- /dev/null +++ b/ecodata/multidim_annotation_func.py @@ -0,0 +1,2228 @@ +""" +Multidimensional annotation backend for ECODATA-Prepare. 
@dataclass
class DatasetSpec:
    """Pair a dataset file path with the variables to use from it.

    NOTE(review): the continuous/categorical split presumably selects the
    sampling strategy per variable; confirm against the consumers of this
    spec.
    """

    # Location of the dataset file.
    path: Union[str, Path]
    # Names of the variables requested from the dataset.
    variables: List[str]
    # Subsets of variable names by type; each instance gets its own list.
    continuous: List[str] = field(default_factory=list)
    categorical: List[str] = field(default_factory=list)
    # Free-form prefix; empty by default.
    label_prefix: str = ""

    @classmethod
    def from_single(cls, path: Union[str, Path], variable: Optional[str], label_prefix: str = "") -> Optional["DatasetSpec"]:
        """Build a spec for one variable, registered as continuous.

        Returns:
            The new spec, or None when ``path`` or ``variable`` is empty or
            None.
        """
        if path and variable:
            return cls(
                path=path,
                variables=[variable],
                continuous=[variable],
                categorical=[],
                label_prefix=label_prefix,
            )
        return None
def parse_movebank_timestamp_series(series: pd.Series, col_name: str = "timestamp") -> pd.Series:
    """Parse a timestamp column, trying several strategies in a fixed order.

    String-oriented strategies run first (default parsing, day-first,
    ``format="mixed"``, ``format="ISO8601"``); each later result only fills
    values the earlier ones left missing. If gaps remain, the raw values are
    read as numeric epochs in s / ms / us / ns and the unit that parses the
    most values fills what is still missing.

    Args:
        series: Raw timestamp values; the input is not modified.
        col_name: Column name used in the error message only.

    Returns:
        Parsed timestamps. Timezone-aware results are passed through
        ``tz_convert(None)`` on a best-effort basis.

    Raises:
        ValueError: if any value is still unparsed after all strategies.
    """
    raw = series.copy()

    # Order matters: earlier strategies take precedence per value.
    strategies = (
        {"errors": "coerce", "utc": False},
        {"errors": "coerce", "utc": False, "dayfirst": True},
        {"errors": "coerce", "utc": False, "format": "mixed"},
        {"errors": "coerce", "utc": False, "format": "ISO8601"},
    )
    parsed_variants: List[pd.Series] = []
    for options in strategies:
        try:
            candidate = pd.to_datetime(raw, **options)
        except Exception:
            # Strategy unsupported or failed for this input: skip it.
            continue
        parsed_variants.append(candidate)

    if parsed_variants:
        out = parsed_variants[0]
        for extra in parsed_variants[1:]:
            out = out.fillna(extra)
    else:
        out = pd.to_datetime(raw, errors="coerce")

    if out.isna().any():
        numeric = pd.to_numeric(raw, errors="coerce")
        epoch_variants = []
        for unit in ("s", "ms", "us", "ns"):
            try:
                epoch_variants.append(pd.to_datetime(numeric, errors="coerce", unit=unit, utc=False))
            except Exception:
                pass
        if epoch_variants:
            # max() keeps the first unit on ties.
            best = max(epoch_variants, key=lambda parsed: int(parsed.notna().sum()))
            out = out.fillna(best)

    unparsed = out.isna()
    if unparsed.any():
        examples = raw[unparsed].astype(str).head(10).tolist()
        raise ValueError(
            f"Timestamp column '{col_name}' contains {int(unparsed.sum())} unparsable value(s). "
            f"Examples: {examples}"
        )

    try:
        if getattr(out.dt, "tz", None) is not None:
            out = out.dt.tz_convert(None)
    except Exception:
        # Best effort: leave the series as parsed if it has no usable .dt.
        pass
    return out
'.join(missing)}") + return names + + +def _rename_standard_coords(ds: xr.Dataset, coord_spec: Optional[Dict[str, Optional[str]]] = None) -> xr.Dataset: + names = _coord_names(ds, coord_spec, require_level=False) + mapping = {} + for std in ("time", "lat", "lon", "level"): + src = names.get(std) + if src and src != std and src in ds.variables: + mapping[src] = std + elif src and src != std and src in ds.dims: + mapping[src] = std + if mapping: + ds = ds.rename(mapping) + if "lat" in ds: + try: + vals = np.asarray(ds["lat"].values, dtype=float) + if vals.ndim == 1 and vals.size > 1 and vals[0] > vals[-1]: + ds = ds.sortby("lat") + except Exception: + pass + if "lon" in ds: + try: + vals = np.asarray(ds["lon"].values, dtype=float) + if vals.ndim == 1 and vals.size > 1 and vals[0] > vals[-1]: + ds = ds.sortby("lon") + except Exception: + pass + return ds + + +def open_dataset(path: Union[str, Path], coord_spec: Optional[Dict[str, Optional[str]]] = None) -> xr.Dataset: + p = _require_file(path, "NetCDF file") + if safe_open_nc_with_time_decoding is not None: + try: + ds = safe_open_nc_with_time_decoding(str(p)) + except Exception: + ds = xr.open_dataset(p, decode_times=True) + else: + try: + ds = xr.open_dataset(p, decode_times=True) + except Exception: + ds = xr.open_dataset(p, decode_times=False) + return _rename_standard_coords(ds, coord_spec) + + +def _wrap_lon(lon: float, lon_values: np.ndarray) -> float: + vals = np.asarray(lon_values, dtype=float) + finite = vals[np.isfinite(vals)] + if finite.size == 0: + return float(lon) + mn, mx = float(np.nanmin(finite)), float(np.nanmax(finite)) + if mn >= 0 and mx > 180 and lon < 0: + return float(lon) % 360.0 + if mn < 0 and mx <= 180 and lon > 180: + return ((float(lon) + 180.0) % 360.0) - 180.0 + return float(lon) + + +def _time_value(t: pd.Timestamp) -> Any: + return np.datetime64(pd.Timestamp(t).to_datetime64()) + + +def _nearest_time_index(values: np.ndarray, t: pd.Timestamp) -> int: + times = pd.to_datetime(values) 
+ arr = times.to_numpy(dtype="datetime64[ns]").astype("int64") + target = np.datetime64(pd.Timestamp(t).to_datetime64()).astype("datetime64[ns]").astype("int64") + return int(np.nanargmin(np.abs(arr - target))) + + +def _select_time_space( + da: xr.DataArray, + t: pd.Timestamp, + lat: float, + lon: float, + *, + time_method: Literal["nearest", "linear"] = "linear", + spatial_method: Literal["nearest", "linear"] = "linear", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> xr.DataArray: + lat_q = float(lat if fixed_lat is None else fixed_lat) + lon_q = float(lon if fixed_lon is None else fixed_lon) + lon_q = _wrap_lon(lon_q, np.asarray(da["lon"].values)) if "lon" in da.coords else lon_q + + out = da + + if spatial_method == "nearest": + isel_indexers: Dict[str, int] = {} + if "lat" in out.dims and "lat" in out.coords: + isel_indexers["lat"] = _nearest_index(np.asarray(out["lat"].values, dtype=float), lat_q) + if "lon" in out.dims and "lon" in out.coords: + isel_indexers["lon"] = _nearest_index(np.asarray(out["lon"].values, dtype=float), lon_q) + if isel_indexers: + out = out.isel(isel_indexers) + else: + spatial_indexers = {} + if "lat" in out.dims or "lat" in out.coords: + spatial_indexers["lat"] = lat_q + if "lon" in out.dims or "lon" in out.coords: + spatial_indexers["lon"] = lon_q + if spatial_indexers: + try: + out = out.interp(spatial_indexers, method="linear") + except Exception: + out = out.sel(spatial_indexers, method="nearest") + + if "time" in out.dims or "time" in out.coords: + if time_method == "nearest": + try: + if "time" in out.dims: + out = out.isel({"time": _nearest_time_index(out["time"].values, pd.Timestamp(t))}) + else: + out = out.sel({"time": _time_value(pd.Timestamp(t))}, method="nearest") + except Exception: + out = out.sel({"time": _time_value(pd.Timestamp(t))}, method="nearest") + else: + try: + out = out.interp({"time": _time_value(pd.Timestamp(t))}, method="linear") + except Exception: + out = 
def geopotential_to_height_m(
    da: xr.DataArray,
    units_override: Optional[str] = None,
    convert_geopotential_to_height: bool = True,
    gravity_constant: float = G0,
) -> xr.DataArray:
    """Return the field as a height array named ``geopotential_height_m``.

    Units are taken from ``units_override`` or, failing that, the array's
    ``units`` attribute. When conversion is enabled and the units either look
    like geopotential or do not look like a height, the data are divided by
    ``gravity_constant``. Otherwise the array is copied unchanged. In both
    cases the result's ``units`` attribute is set to "m".

    Args:
        da: Geopotential or geopotential-height field.
        units_override: Units string that takes precedence over ``da.attrs``.
        convert_geopotential_to_height: Allow the division by gravity.
        gravity_constant: Divisor used for the conversion.

    Returns:
        The converted (or copied) array, renamed to ``geopotential_height_m``.
    """
    unit_text = (units_override or da.attrs.get("units") or "").lower()
    # Normalise exponent and division spelling before matching.
    normalised = unit_text.replace("**", "^").replace("/", " ")

    height_words = {"m", "meter", "meters", "metre", "metres"}
    looks_like_height = (
        normalised.strip() in height_words
        or "geopotential metre" in normalised
        or "gpm" in normalised
    )
    geopotential_markers = ("m^2 s^-2", "m2 s-2", "m2 s^-2", "m2 s**-2")
    looks_like_geopotential = any(marker in normalised for marker in geopotential_markers)

    if convert_geopotential_to_height and (looks_like_geopotential or not looks_like_height):
        result = da / float(gravity_constant)
    else:
        result = da.copy()

    result.attrs["units"] = "m"
    return result.rename("geopotential_height_m")
ds: xr.Dataset, + variable: str, + *, + t: pd.Timestamp, + lat: float, + lon: float, + units_override: Optional[str], + convert_geopotential_to_height: bool, + gravity_constant: float, + time_method: Literal["nearest", "linear"] = "linear", + spatial_method: Literal["nearest", "linear"] = "linear", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Tuple[np.ndarray, np.ndarray]: + z = geopotential_to_height_m(ds[variable], units_override, convert_geopotential_to_height, gravity_constant) + tmp = z.to_dataset(name="geopotential_height_m") + return sample_level_profile( + tmp, "geopotential_height_m", t=t, lat=lat, lon=lon, + time_method=time_method, spatial_method=spatial_method, + fixed_lat=fixed_lat, fixed_lon=fixed_lon, + ) + + +def sample_surface_value( + ds: xr.Dataset, + variable: str, + *, + t: pd.Timestamp, + lat: float, + lon: float, + variable_type: VariableType = "continuous", + spatial_method: HorizontalMethod = "nearest", + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Any: + if variable not in ds.data_vars: + raise ValueError(f"Variable '{variable}' not found in surface dataset.") + time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear" + spatial_xr: Literal["nearest", "linear"] = "nearest" if (variable_type == "categorical" or spatial_method == "nearest") else "linear" + da = _select_time_space( + ds[variable], t, lat, lon, + time_method=time_method, + spatial_method=spatial_xr, + fixed_lat=fixed_lat, + fixed_lon=fixed_lon, + ) + arr = np.asarray(da.values) + return arr.squeeze().item() if arr.size else np.nan + + +def _prepare_vertical_nodes(levels: Sequence[Any], heights: Sequence[float], values: Sequence[float], surface_value=None, surface_height=None) -> pd.DataFrame: + rows = [] + for lev, z, val in zip(levels, heights, values): + zf, vf = _safe_float(z), _safe_float(val) + if np.isfinite(zf) and np.isfinite(vf): + rows.append({"level": 
def vertical_sample(
    levels: Sequence[Any],
    heights_m: Sequence[float],
    values: Sequence[float],
    target_height_m: float,
    *,
    method: VerticalMethod = "nearest",
    variable_type: VariableType = "continuous",
    surface_value: Optional[float] = None,
    surface_height_m: Optional[float] = None,
    allow_extrapolation: bool = False,
) -> Tuple[float, Dict[str, Any]]:
    """Evaluate a vertical profile at a target height.

    The profile nodes (optionally extended with a surface anchor) are built by
    ``_prepare_vertical_nodes``. Categorical variables always use the nearest
    node; otherwise ``method`` decides between nearest-node lookup and linear
    interpolation between the two bracketing nodes.

    Returns:
        ``(value, diagnostics)``. ``value`` is NaN when the target height is
        invalid, the profile is empty, or the target lies outside the profile
        and ``allow_extrapolation`` is false. ``diagnostics`` records the
        matched/bracketing levels, heights, weight and any warning.
    """
    chosen_method = "nearest" if variable_type == "categorical" else method
    target = _safe_float(target_height_m)
    nodes = _prepare_vertical_nodes(levels, heights_m, values, surface_value, surface_height_m)
    diag: Dict[str, Any] = {
        "vertical_method": chosen_method,
        "target_height_msl_m": target,
        "matched_level": np.nan,
        "matched_level_height_m": np.nan,
        "height_difference_m": np.nan,
        "lower_level": np.nan,
        "upper_level": np.nan,
        "lower_height_m": np.nan,
        "upper_height_m": np.nan,
        "vertical_weight_upper": np.nan,
        "surface_anchor_used": False,
        "vertical_out_of_range": False,
        "vertical_warning": "",
    }

    def _rejected(reason: str) -> Tuple[float, Dict[str, Any]]:
        # Record why no value could be produced and hand back NaN.
        diag["vertical_warning"] = reason
        return np.nan, diag

    if not np.isfinite(target):
        return _rejected("invalid_target_height")
    if nodes.empty:
        return _rejected("empty_vertical_profile")

    node_heights = nodes["height_m"].to_numpy(dtype=float)
    node_values = nodes["value"].to_numpy(dtype=float)
    count = len(node_heights)

    # Nearest-node lookup (also the only option with a single node).
    if chosen_method == "nearest" or count == 1:
        pick = int(np.nanargmin(np.abs(node_heights - target)))
        diag["matched_level"] = nodes.loc[pick, "level"]
        diag["matched_level_height_m"] = float(node_heights[pick])
        diag["height_difference_m"] = float(target - node_heights[pick])
        diag["surface_anchor_used"] = bool(nodes.loc[pick, "is_surface"])
        return float(node_values[pick]), diag

    # Choose the bracketing pair of nodes (nodes are sorted by height).
    if target < node_heights[0]:
        diag["vertical_out_of_range"] = True
        if not allow_extrapolation:
            return _rejected("below_lowest_vertical_node")
        lower, upper = 0, min(1, count - 1)
    elif target > node_heights[-1]:
        diag["vertical_out_of_range"] = True
        if not allow_extrapolation:
            return _rejected("above_highest_vertical_node")
        lower, upper = max(0, count - 2), count - 1
    else:
        pos = int(np.searchsorted(node_heights, target, side="left"))
        if pos == 0 or (pos < count and np.isclose(node_heights[pos], target)):
            # Target coincides with a node: no blending needed.
            lower = upper = pos
        else:
            lower, upper = pos - 1, min(pos, count - 1)

    if lower == upper or np.isclose(node_heights[lower], node_heights[upper]):
        weight = 0.0
        result = float(node_values[lower])
    else:
        weight = float((target - node_heights[lower]) / (node_heights[upper] - node_heights[lower]))
        result = float(node_values[lower] * (1 - weight) + node_values[upper] * weight)

    if abs(target - node_heights[lower]) <= abs(target - node_heights[upper]):
        closest = lower
    else:
        closest = upper

    diag.update({
        "lower_level": nodes.loc[lower, "level"],
        "upper_level": nodes.loc[upper, "level"],
        "lower_height_m": float(node_heights[lower]),
        "upper_height_m": float(node_heights[upper]),
        "vertical_weight_upper": float(weight),
        "matched_level": nodes.loc[closest, "level"],
        "matched_level_height_m": float(node_heights[closest]),
        "height_difference_m": float(target - node_heights[closest]),
        "surface_anchor_used": bool(nodes.loc[lower, "is_surface"] or nodes.loc[upper, "is_surface"]),
    })
    return result, diag
def compute_dem_slope_aspect(
    dem_file: Union[str, Path],
    lat: float,
    lon: float,
    sample_radius_px: int = 1,
) -> Tuple[float, float]:
    """
    Compute terrain slope and upslope bearing at a point from a DEM raster.

    First-order partial derivatives of elevation are estimated with finite
    differences between the query pixel and its immediate neighbours (central
    differences where both neighbours exist, one-sided at a raster edge).

    Args:
        dem_file: Path to the DEM raster (GeoTIFF or similar).
        lat, lon: Geographic coordinates of the query point (degrees).
        sample_radius_px: Half-size of the pixel window read around the query
            pixel (default 1 = 3x3 window). The derivative stencil itself
            always uses the directly adjacent pixels.

    Returns:
        (slope_rad, aspect_rad) where
        slope_rad  -- terrain slope angle from horizontal (radians, 0..π/2).
        aspect_rad -- compass bearing of steepest ascent (the uphill
                      direction), clockwise from North (radians, 0..2π).
                      The terrain surface faces the opposite way.
        Both values are NaN on error, when the point lies outside the raster,
        or where the DEM has no data.
    """
    try:
        import rasterio
        from pyproj import Transformer
    except Exception:
        return np.nan, np.nan

    try:
        path = _require_file(dem_file, "DEM file")
        with rasterio.open(path) as src:
            x, y = float(lon), float(lat)
            if src.crs is not None and str(src.crs).upper() not in {"EPSG:4326", "OGC:CRS84"}:
                x, y = Transformer.from_crs(
                    "EPSG:4326", src.crs, always_xy=True
                ).transform(x, y)

            row_c, col_c = src.index(x, y)
            # Same guard as sample_dem_elevation: without it a clamped window
            # would silently describe terrain far away from the query point.
            if row_c < 0 or col_c < 0 or row_c >= src.height or col_c >= src.width:
                return np.nan, np.nan

            r0 = max(0, row_c - sample_radius_px)
            r1 = min(src.height, row_c + sample_radius_px + 1)
            c0 = max(0, col_c - sample_radius_px)
            c1 = min(src.width, col_c + sample_radius_px + 1)
            if r1 - r0 < 2 or c1 - c0 < 2:
                return np.nan, np.nan

            patch = src.read(1, window=((r0, r1), (c0, c1)), masked=True).astype(float)
            # Turn masked cells into NaN so that nodata propagates through the
            # arithmetic below as an ordinary non-finite value.
            patch = np.ma.filled(patch, np.nan)
            if src.nodata is not None:
                patch[patch == float(src.nodata)] = np.nan

            # Pixel size in metres
            res_x = abs(src.transform.a)
            res_y = abs(src.transform.e)
            if src.crs is not None and src.crs.is_geographic:
                # Convert arc-degrees to metres at this latitude
                lat_rad = math.radians(float(lat))
                res_x = res_x * math.pi / 180.0 * 6_371_000.0 * math.cos(lat_rad)
                res_y = res_y * math.pi / 180.0 * 6_371_000.0

            rows, cols = patch.shape
            # Position of the query pixel inside the window. When the window is
            # clipped at a raster edge the pixel is no longer at the centre, so
            # rows // 2, cols // 2 would differentiate around the wrong cell.
            cr, cc = row_c - r0, col_c - c0

            # East-west gradient (positive = elevation increases eastward)
            if 0 < cc < cols - 1:
                dz_dx = (patch[cr, cc + 1] - patch[cr, cc - 1]) / (2.0 * res_x)
            elif cc < cols - 1:
                dz_dx = (patch[cr, cc + 1] - patch[cr, cc]) / res_x
            else:
                dz_dx = (patch[cr, cc] - patch[cr, cc - 1]) / res_x

            # North-south gradient (positive = elevation increases northward).
            # Rasterio row index increases southward, so north = smaller row index.
            if 0 < cr < rows - 1:
                dz_dy = (patch[cr - 1, cc] - patch[cr + 1, cc]) / (2.0 * res_y)
            elif cr < rows - 1:
                dz_dy = (patch[cr, cc] - patch[cr + 1, cc]) / res_y
            else:
                dz_dy = (patch[cr - 1, cc] - patch[cr, cc]) / res_y

            if not (np.isfinite(dz_dx) and np.isfinite(dz_dy)):
                return np.nan, np.nan

            slope = math.atan(math.sqrt(dz_dx**2 + dz_dy**2))
            # atan2(east gradient, north gradient) is the compass bearing of
            # the elevation gradient, i.e. the uphill direction.
            aspect = (math.atan2(dz_dx, dz_dy) + 2.0 * math.pi) % (2.0 * math.pi)
            return float(slope), float(aspect)

    except Exception:
        # Best effort: any raster/CRS failure yields "no terrain information".
        return np.nan, np.nan


def compute_orographic_uplift(
    u10_ms: float,
    v10_ms: float,
    slope_rad: float,
    aspect_rad: float,
) -> float:
    """
    Orographic updraft velocity Wo (m/s) following Bohrer et al. (2012).

        Wo = V_surface * sin(slope) * cos(wind_to - upslope_bearing)

    where wind_to is the compass bearing the wind is blowing *toward* and
    upslope_bearing is the uphill direction returned by
    compute_dem_slope_aspect(). This equals the textbook form written with the
    wind-from direction and the downslope-facing aspect, because both angles
    are shifted by 180°.

    Positive Wo = wind blowing up the slope -> updraft.
    Negative Wo = wind blowing down the slope (lee side) -> downdraft.

    Args:
        u10_ms: 10-metre U-component of wind (m/s, eastward positive).
        v10_ms: 10-metre V-component of wind (m/s, northward positive).
        slope_rad: Terrain slope angle in radians, from compute_dem_slope_aspect().
        aspect_rad: Uphill bearing clockwise from North (radians), from
            compute_dem_slope_aspect().

    Returns:
        Wo in m/s, 0.0 for calm wind, or NaN if any input is missing
        (None, non-numeric or non-finite).
    """
    try:
        u, v = float(u10_ms), float(v10_ms)
        slope = float(slope_rad)
        upslope_bearing = float(aspect_rad)
    except (TypeError, ValueError):
        # None / non-numeric inputs count as missing rather than as an error.
        return np.nan

    if not all(np.isfinite([u, v, slope, upslope_bearing])):
        return np.nan

    V = math.sqrt(u * u + v * v)
    if V < 1e-6:
        return 0.0

    # atan2(u, v) with u = eastward and v = northward component is the bearing
    # the air is moving TOWARD (the same convention add_wind_metrics uses for
    # its wind_to direction), not the meteorological wind-from direction.
    wind_to = (math.atan2(u, v) + 2.0 * math.pi) % (2.0 * math.pi)

    return float(V * math.sin(slope) * math.cos(wind_to - upslope_bearing))
np.nan, diag + return terrain + h, diag + N: Optional[float] = None + if geoid_mode == "geographiclib": + N = geoid_undulation_geographiclib(lat, lon) + elif geoid_mode == "pyproj_grid": + N = geoid_undulation_pyproj(lat, lon, geoid_grid_path) + elif geoid_mode == "constant": + N = float(constant_geoid_undulation_m) + elif geoid_mode == "none": + N = 0.0 + if N is None: + N = float(constant_geoid_undulation_m) + diag["height_conversion_warning"] = "geoid_lookup_failed_used_constant_N" + diag["geoid_undulation_m"] = float(N) + return h - float(N), diag + + +def _nearest_index(arr: np.ndarray, x: float) -> int: + arr = np.asarray(arr, dtype=float) + idx = int(np.searchsorted(arr, x)) + if idx <= 0: + return 0 + if idx >= len(arr): + return len(arr) - 1 + return idx if abs(arr[idx] - x) < abs(arr[idx - 1] - x) else idx - 1 + + +def _k_nearest_indices(glat: np.ndarray, glon: np.ndarray, lat: float, lon: float, k: int) -> List[Tuple[int, int]]: + if ae_k_nearest_indices is not None: + try: + return list(ae_k_nearest_indices(glat, glon, lat, lon, k)) + except Exception: + pass + i0 = _nearest_index(glat, lat) + j0 = _nearest_index(glon, lon) + r = int(np.ceil(max(1, np.sqrt(k)))) + candidates = [] + for ii in range(max(0, i0 - r), min(len(glat) - 1, i0 + r) + 1): + for jj in range(max(0, j0 - r), min(len(glon) - 1, j0 + r) + 1): + d = float(np.hypot(glat[ii] - lat, glon[jj] - lon)) + candidates.append((d, ii, jj)) + candidates.sort(key=lambda x: x[0]) + return [(ii, jj) for _, ii, jj in candidates[:k]] + + +def _idw(values: Sequence[float], distances: Sequence[float], p: float = 2.0) -> float: + if ae_idw is not None: + try: + return float(ae_idw(values, distances, p=p)) + except Exception: + pass + vals = np.asarray(values, dtype=float) + d = np.asarray(distances, dtype=float) + 1e-12 + mask = np.isfinite(vals) + if not mask.any(): + return np.nan + w = 1.0 / (d[mask] ** p) + return float(np.sum(vals[mask] * w) / np.sum(w)) + + +def _filter_boundary(df: pd.DataFrame, 
def _dataset_bounds(ds: xr.Dataset) -> Optional[Dict[str, float]]:
    """Return the lat/lon extent of ``ds`` as ``{"S", "N", "W", "E"}``.

    Returns ``None`` when the extent cannot be determined (for example when
    the ``lat`` or ``lon`` variable is missing).
    """
    try:
        lat_axis = ds["lat"]
        south, north = float(lat_axis.min()), float(lat_axis.max())
        lon_axis = ds["lon"]
        west, east = float(lon_axis.min()), float(lon_axis.max())
    except Exception:
        return None
    return {"S": south, "N": north, "W": west, "E": east}
def _slice_time_with_bracket(ds: xr.Dataset, tmin: pd.Timestamp, tmax: pd.Timestamp) -> xr.Dataset:
    """
    Restrict ``ds`` to the movement time range plus one neighbouring timestep
    on each side, when such a timestep exists.

    The extra timesteps matter for linear time interpolation: a movement
    timestamp that falls between two dataset timesteps needs both of them, and
    a strict ``slice(tmin, tmax)`` could discard the bracketing ones.

    The dataset is returned unchanged when it has no ``time`` variable, when
    the time axis is empty, or when no index window can be formed. If the
    index-based subsetting fails, a plain label slice is attempted before
    giving up and returning ``ds`` as is.
    """
    has_time = "time" in ds.coords or "time" in ds.variables
    if not has_time:
        return ds

    def _as_ns(stamp):
        # Normalise any timestamp-like value to numpy datetime64[ns].
        return np.datetime64(pd.Timestamp(stamp).to_datetime64()).astype("datetime64[ns]")

    try:
        stamps = pd.to_datetime(ds["time"].values)
        if len(stamps) == 0:
            return ds

        axis = stamps.to_numpy(dtype="datetime64[ns]")
        lower_bound = _as_ns(tmin)
        upper_bound = _as_ns(tmax)

        # Assumes the time axis is sorted ascending.
        first_inside = int(np.searchsorted(axis, lower_bound, side="left"))
        last_inside = int(np.searchsorted(axis, upper_bound, side="right")) - 1

        # Widen the window by one timestep on each side, clamped to the axis.
        first = max(0, first_inside - 1)
        last = min(len(axis) - 1, last_inside + 1)

        if last < first:
            return ds
        return ds.isel({"time": slice(first, last + 1)})
    except Exception:
        try:
            return ds.sel({"time": slice(tmin, tmax)})
        except Exception:
            return ds
+ pass + try: + if "lon" in out.coords and config.lon_col in movement.columns: + lon_vals = np.asarray(out["lon"].values, dtype=float) + lon_series = movement[config.lon_col].astype(float).map(lambda x: _wrap_lon(float(x), lon_vals)) + lon_min = float(lon_series.min()) - buffer_deg + lon_max = float(lon_series.max()) + buffer_deg + if lon_max - lon_min < 350: + out = _slice_coord(out, "lon", lon_min, lon_max) + except Exception: + pass + return out + + +def _geopotential_cache_key(t: pd.Timestamp, lat: float, lon: float, time_method: str, fixed_lat: Optional[float], fixed_lon: Optional[float]) -> Tuple[Any, ...]: + lat_key = round(float(lat if fixed_lat is None else fixed_lat), 6) + lon_key = round(float(lon if fixed_lon is None else fixed_lon), 6) + t_key = pd.Timestamp(t).to_datetime64() + return (t_key, lat_key, lon_key, str(time_method)) + + +def _get_geopotential_profile_cached( + ds_geo: xr.Dataset, + geo_name: str, + row: pd.Series, + config: MultidimAnnotationConfig, + variable_type: VariableType, + cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None, + fixed_lat: Optional[float] = None, + fixed_lon: Optional[float] = None, +) -> Tuple[np.ndarray, np.ndarray]: + t = pd.Timestamp(row[config.time_col]) + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + time_method: Literal["nearest", "linear"] = "nearest" if variable_type == "categorical" else "linear" + key = _geopotential_cache_key(t, lat, lon, time_method, fixed_lat, fixed_lon) + if cache is not None and key in cache: + return cache[key] + levels_geo, heights = sample_geopotential_profile( + ds_geo, + geo_name, + t=t, + lat=lat, + lon=lon, + units_override=config.geopotential_units, + convert_geopotential_to_height=config.convert_geopotential_to_height, + gravity_constant=config.gravity_constant, + time_method=time_method, + spatial_method="nearest", + fixed_lat=fixed_lat, + fixed_lon=fixed_lon, + ) + if cache is not None: + cache[key] = (levels_geo, heights) + 
def _sample_var_at_cell(
    ds_var: xr.Dataset,
    var_name: str,
    ds_geo: xr.Dataset,
    geo_name: str,
    row: pd.Series,
    config: MultidimAnnotationConfig,
    target_height: float,
    terrain: float,
    variable_type: VariableType,
    surface_value: Optional[float] = None,
    fixed_lat: Optional[float] = None,
    fixed_lon: Optional[float] = None,
    geo_cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None,
) -> Tuple[float, Dict[str, Any]]:
    """Sample one multi-level variable at a single grid column and target height.

    The geopotential-derived height profile (possibly served from
    ``geo_cache``) and the variable profile are taken with nearest-neighbour
    horizontal selection, aligned level by level, and handed to
    ``vertical_sample``. For continuous variables the surface value is used as
    an extra lower node when ``config.use_surface_as_lower_anchor`` is set.

    Returns:
        The ``(value, diagnostics)`` pair produced by ``vertical_sample``.
    """
    when = pd.Timestamp(row[config.time_col])
    point_lat = float(row[config.lat_col])
    point_lon = float(row[config.lon_col])
    if variable_type == "categorical":
        temporal_mode = "nearest"
    else:
        temporal_mode = "linear"

    geo_levels, geo_heights = _get_geopotential_profile_cached(
        ds_geo,
        geo_name,
        row,
        config,
        variable_type,
        geo_cache,
        fixed_lat=fixed_lat,
        fixed_lon=fixed_lon,
    )
    var_levels, var_values = sample_level_profile(
        ds_var,
        var_name,
        t=when,
        lat=point_lat,
        lon=point_lon,
        time_method=temporal_mode,
        spatial_method="nearest",
        fixed_lat=fixed_lat,
        fixed_lon=fixed_lon,
    )

    identical_levels = len(geo_levels) == len(var_levels) and np.array_equal(
        np.asarray(geo_levels), np.asarray(var_levels)
    )
    if identical_levels:
        heights_at_levels = np.asarray(geo_heights, dtype=float)
    else:
        # Align by the string form of the level label; unmatched levels get NaN.
        height_by_label = {str(label): height for label, height in zip(geo_levels, geo_heights)}
        heights_at_levels = np.asarray(
            [height_by_label.get(str(label), np.nan) for label in var_levels],
            dtype=float,
        )

    anchor_value = None
    if config.use_surface_as_lower_anchor and variable_type == "continuous":
        anchor_value = surface_value
    anchor_height = None
    if anchor_value is not None:
        anchor_height = _surface_height(config, terrain)

    return vertical_sample(
        var_levels,
        heights_at_levels,
        var_values,
        target_height,
        method=config.vertical_method,
        variable_type=variable_type,
        surface_value=anchor_value,
        surface_height_m=anchor_height,
        allow_extrapolation=config.allow_vertical_extrapolation,
    )
xr.Dataset, + geo_name: str, + row: pd.Series, + config: MultidimAnnotationConfig, + target_height: float, + terrain: float, + variable_type: VariableType, + surface_value: Optional[float] = None, + geo_cache: Optional[Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]]] = None, +) -> Tuple[float, Dict[str, Any]]: + if variable_type == "categorical" or config.spatial_method == "nearest": + return _sample_var_at_cell( + ds_var, + var_name, + ds_geo, + geo_name, + row, + config, + target_height, + terrain, + variable_type, + surface_value, + geo_cache=geo_cache, + ) + + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + glat = np.asarray(ds_var["lat"].values, dtype=float) + glon = np.asarray(ds_var["lon"].values, dtype=float) + lon_adj = _wrap_lon(lon, glon) + k = max(2, int(config.smoothing_k)) + samples, dists, last_diag = [], [], {} + for ii, jj in _k_nearest_indices(glat, glon, lat, lon_adj, k): + flat, flon = float(glat[ii]), float(glon[jj]) + val, diag = _sample_var_at_cell( + ds_var, + var_name, + ds_geo, + geo_name, + row, + config, + target_height, + terrain, + variable_type, + surface_value, + fixed_lat=flat, + fixed_lon=flon, + geo_cache=geo_cache, + ) + samples.append(val) + dists.append(float(np.hypot(flat - lat, flon - lon_adj))) + last_diag = diag + return _idw(samples, dists), last_diag + + +def _bearing_deg(lat1: float, lon1: float, lat2: float, lon2: float) -> float: + if not all(np.isfinite([lat1, lon1, lat2, lon2])): + return np.nan + phi1, phi2 = math.radians(lat1), math.radians(lat2) + dlon = math.radians(lon2 - lon1) + x = math.sin(dlon) * math.cos(phi2) + y = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(dlon) + return (math.degrees(math.atan2(x, y)) + 360.0) % 360.0 + + +def add_track_bearing(df: pd.DataFrame, id_col: str, time_col: str, lat_col: str, lon_col: str, heading_col: Optional[str], heading_source: str) -> pd.DataFrame: + out = df.copy() + if heading_source == "column" and 
def add_wind_metrics(df: pd.DataFrame, u_col: str = "td_u_at_height", v_col: str = "td_v_at_height") -> pd.DataFrame:
    """Return a copy of ``df`` with derived wind columns appended.

    Always added: ``wind_speed_ms``, ``wind_to_direction_deg`` and
    ``wind_from_direction_deg``. When ``df`` has a ``track_bearing_deg``
    column, ``wind_support_ms`` (component along the track) and
    ``crosswind_ms`` are added as well. A missing wind component column is
    treated as all-NaN. The input frame is not modified.
    """
    result = df.copy()

    def _component(column: str) -> pd.Series:
        # Numeric view of a wind component, or all-NaN when the column is absent.
        if column in result.columns:
            return pd.to_numeric(result[column], errors="coerce")
        return pd.Series(np.nan, index=result.index)

    east = _component(u_col)
    north = _component(v_col)

    result["wind_speed_ms"] = np.sqrt(east * east + north * north)
    toward_deg = (np.degrees(np.arctan2(east, north)) + 360.0) % 360.0
    result["wind_to_direction_deg"] = toward_deg
    result["wind_from_direction_deg"] = (toward_deg + 180.0) % 360.0

    if "track_bearing_deg" not in result.columns:
        return result

    heading = np.radians(pd.to_numeric(result["track_bearing_deg"], errors="coerce"))
    result["wind_support_ms"] = east * np.sin(heading) + north * np.cos(heading)
    result["crosswind_ms"] = east * np.cos(heading) - north * np.sin(heading)
    return result
def _fast_open_4d(ds: xr.Dataset, variable: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load a multi-level variable into memory for fast numpy sampling.

    Returns ``(arr, times, levels, lat, lon)`` where ``arr`` is a float array
    ordered as (time, level, lat, lon), ``times`` is ``datetime64[ns]`` and
    ``lat``/``lon`` are float arrays.

    Raises ``ValueError`` when ``variable`` is not a data variable of ``ds``;
    dimension problems are reported by ``_fast_required_dims``.
    """
    if variable not in ds.data_vars:
        raise ValueError(f"Variable '{variable}' not found in dataset.")

    axis_order = ("time", "level", "lat", "lon")
    da = _fast_required_dims(ds[variable], axis_order, variable).transpose(*axis_order)

    def _float_axis(name: str) -> np.ndarray:
        # Coordinate values as a float array.
        return np.asarray(da[name].values, dtype=float)

    cube = np.asarray(da.load().values, dtype=float)
    time_axis = pd.to_datetime(da["time"].values).to_numpy(dtype="datetime64[ns]")
    level_axis = np.asarray(da["level"].values)
    lat_axis = _float_axis("lat")
    lon_axis = _float_axis("lon")

    return cube, time_axis, level_axis, lat_axis, lon_axis
pd.to_datetime(da["time"].values).to_numpy(dtype="datetime64[ns]") + lat = np.asarray(da["lat"].values, dtype=float) + lon = np.asarray(da["lon"].values, dtype=float) + + return arr, times, lat, lon + + +def _fast_time_index_weight( + times: np.ndarray, + target: pd.Timestamp, + *, + method: Literal["nearest", "linear"], +) -> Tuple[int, int, float]: + """ + Return t0, t1, weight for fast temporal sampling. + + For nearest: + value = arr[t0] + For linear: + value = arr[t0] * (1 - w) + arr[t1] * w + """ + if len(times) == 0: + raise ValueError("Cannot sample dataset with empty time coordinate.") + + arr = np.asarray(times).astype("datetime64[ns]") + target64 = np.datetime64(pd.Timestamp(target).to_datetime64()).astype("datetime64[ns]") + + if method == "nearest" or len(arr) == 1: + diffs = np.abs(arr.astype("int64") - target64.astype("int64")) + idx = int(np.nanargmin(diffs)) + return idx, idx, 0.0 + + right = int(np.searchsorted(arr, target64, side="left")) + + if right <= 0: + return 0, 0, 0.0 + if right >= len(arr): + last = len(arr) - 1 + return last, last, 0.0 + if arr[right] == target64: + return right, right, 0.0 + + left = right - 1 + t0 = arr[left].astype("int64") + t1 = arr[right].astype("int64") + tt = target64.astype("int64") + + if t1 == t0: + return left, right, 0.0 + + w = float((tt - t0) / (t1 - t0)) + return left, right, w + + +def _fast_nearest_lat_lon_indices( + lat_values: np.ndarray, + lon_values: np.ndarray, + lat: float, + lon: float, +) -> Tuple[int, int]: + lon_adj = _wrap_lon(float(lon), lon_values) + yi = _nearest_index(np.asarray(lat_values, dtype=float), float(lat)) + xi = _nearest_index(np.asarray(lon_values, dtype=float), lon_adj) + return int(yi), int(xi) + + +def _fast_sample_4d_profile( + arr: np.ndarray, + times: np.ndarray, + lat_values: np.ndarray, + lon_values: np.ndarray, + *, + t: pd.Timestamp, + lat: float, + lon: float, + variable_type: VariableType, +) -> np.ndarray: + time_method: Literal["nearest", "linear"] = "nearest" 
def _fast_sample_3d_value(
    arr: np.ndarray,
    times: np.ndarray,
    lat_values: np.ndarray,
    lon_values: np.ndarray,
    *,
    t: pd.Timestamp,
    lat: float,
    lon: float,
    variable_type: VariableType,
) -> float:
    """Sample a (time, lat, lon) array at the nearest grid cell.

    Categorical variables take the nearest timestep; all others are linearly
    interpolated between the two bracketing timesteps.
    """
    if variable_type == "categorical":
        temporal_mode = "nearest"
    else:
        temporal_mode = "linear"

    first, second, weight = _fast_time_index_weight(times, t, method=temporal_mode)
    row_idx, col_idx = _fast_nearest_lat_lon_indices(lat_values, lon_values, lat, lon)

    earlier = float(arr[first, row_idx, col_idx])
    needs_blend = not (second == first or weight == 0.0)
    if not needs_blend:
        return earlier

    later = float(arr[second, row_idx, col_idx])
    return float(earlier * (1.0 - weight) + later * weight)
+ """ + if len(geo_levels) == len(var_levels) and np.array_equal(np.asarray(geo_levels), np.asarray(var_levels)): + return np.asarray(geo_heights, dtype=float) + + geo_map = {str(k): v for k, v in zip(geo_levels, geo_heights)} + return np.asarray([geo_map.get(str(lev), np.nan) for lev in var_levels], dtype=float) + +_RHO_AIR = 1.225 # kg/m³, standard sea-level air density +_CP_AIR = 1005.0 # J/(kg·K), specific heat of dry air at constant pressure +_G = 9.80665 # m/s², gravitational acceleration +_R_DRY = 287.05 # J/(kg·K), specific gas constant for dry air + +def compute_thermal_updraft_w_star( + surface_heat_flux_wm2: float, + boundary_layer_height_m: float, + temperature_2m_K: float, +) -> float: + """ + Deardorff convective velocity scale w* (m/s). + + Standard measure of thermal updraft intensity used by Movebank ENV-DATA + and described in Bohrer et al. (2012, Ecology Letters). + + w* = (g/T * (H / (rho * cp)) * zi) ^ (1/3) + + Args: + surface_heat_flux_wm2: ERA5 surface sensible heat flux (W/m²). + Positive = surface heating the atmosphere = uplift. + boundary_layer_height_m: ERA5 planetary boundary layer height (m). + temperature_2m_K: ERA5 2-metre temperature (K), used as a proxy + for surface potential temperature. + + Returns: + w* in m/s. Returns 0.0 when heat flux <= 0 (no convection). + Returns NaN when any input is missing or physically invalid. 
def _finalize_and_save_annotation_output(out: pd.DataFrame, config: MultidimAnnotationConfig) -> pd.DataFrame:
    """Finalize derived metrics, save CSV output, and return the annotated DataFrame.

    Steps, each gated by the matching ``config.derive_*`` flag and by the
    presence of the columns it needs:

    1. Wind metrics from ``td_u_at_height`` / ``td_v_at_height`` (track
       bearing is added first when wind support / crosswind is requested).
    2. Vertical motion: omega (Pa/s) converted to m/s.
    3. Thermal updraft (w*) from the three surface columns.
    4. Orographic uplift from the 10 m wind columns and the DEM.
    5. Write ``config.output_csv`` and, optionally, one CSV per individual.

    Args:
        out: Annotated movement table produced by the sampling loop.
        config: Annotation configuration (column names, flags, output path).

    Returns:
        The DataFrame that was written to disk, including derived columns.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    if config.derive_wind_speed_direction or config.derive_wind_support_crosswind:
        if "td_u_at_height" in out.columns and "td_v_at_height" in out.columns:
            if config.derive_wind_support_crosswind:
                out = add_track_bearing(
                    out,
                    config.id_col,
                    config.time_col,
                    config.lat_col,
                    config.lon_col,
                    config.heading_col,
                    config.heading_source,
                )
            out = add_wind_metrics(out)

    # --- Vertical motion: convert ERA5 omega (Pa/s) to geometric w (m/s) ---
    if config.derive_vertical_motion and "td_w_at_height" in out.columns:
        omega = pd.to_numeric(out["td_w_at_height"], errors="coerce")

        has_temp = "td_temperature_at_height" in out.columns
        # First diagnostics column holding a matched level (not a matched
        # level *height*); its values are multiplied by 100 below (hPa -> Pa).
        # NOTE(review): this takes the first such column of any variable, not
        # necessarily the one belonging to the w component - confirm.
        level_col = next(
            (c for c in out.columns if c.endswith("_matched_level") and "height" not in c),
            None,
        )
        has_level = level_col is not None

        if has_temp and has_level:
            T_K = pd.to_numeric(out["td_temperature_at_height"], errors="coerce")
            P_Pa = pd.to_numeric(out[level_col], errors="coerce") * 100.0  # hPa -> Pa
            rho = P_Pa / (_R_DRY * T_K)
            out["vertical_motion_ms"] = -omega / (rho * _G)
            out["vertical_motion_omega_Pa_s"] = omega
            out["vertical_motion_note"] = (
                "vertical_motion_ms: ERA5 omega (Pa/s) converted to geometric "
                "vertical velocity (m/s) via w = -omega / (rho * g), "
                "rho = P / (R_dry * T). Positive = upward."
            )
        else:
            # Fall back to a standard-atmosphere approximation (rho ~ 1.0 kg/m³)
            # valid roughly between 1 and 10 km altitude.
            out["vertical_motion_ms"] = -omega / (1.0 * _G)
            out["vertical_motion_omega_Pa_s"] = omega
            out["vertical_motion_note"] = (
                "WARNING: vertical_motion_ms estimated with rho=1.0 kg/m3 "
                "(standard atmosphere approximation). For accurate conversion "
                "provide temperature and pressure level data. Positive = upward."
            )

    # --- Thermal updraft: Deardorff w* ---
    if config.derive_thermal_proxy:
        shf_col = "surface_surface_sensible_heat_flux"
        blh_col = "surface_boundary_layer_height"
        t2m_col = "surface_2m_temperature"

        if shf_col in out.columns and blh_col in out.columns and t2m_col in out.columns:
            # Column-wise zip instead of iterrows(): same values per row,
            # without building a Series for every point.
            out["thermal_updraft_w_star_ms"] = [
                compute_thermal_updraft_w_star(shf, blh, t2m)
                for shf, blh, t2m in zip(out[shf_col], out[blh_col], out[t2m_col])
            ]
            out["thermal_updraft_note"] = (
                "Deardorff convective velocity scale w* (m/s). "
                "Positive = convective uplift available. "
                "Method: Bohrer et al. 2012 / Movebank ENV-DATA."
            )
        elif "td_temperature_at_height" in out.columns:
            out["temperature_at_height_K"] = out["td_temperature_at_height"]
            out["thermal_updraft_note"] = (
                "WARNING: w* not computed. Requires surface variables: "
                "surface_sensible_heat_flux, boundary_layer_height, 2m_temperature. "
                "Storing raw temperature at flight height instead."
            )

    # --- Orographic uplift: Bohrer et al. 2012 ---
    if config.derive_orographic_uplift and config.dem_file:
        u10_col = "surface_u_component_of_wind_10m"
        v10_col = "surface_v_component_of_wind_10m"

        if u10_col in out.columns and v10_col in out.columns:
            slopes_aspects = [
                compute_dem_slope_aspect(config.dem_file, float(lat), float(lon))
                for lat, lon in zip(out[config.lat_col], out[config.lon_col])
            ]
            out["orographic_uplift_ms"] = [
                compute_orographic_uplift(float(u10), float(v10), sa[0], sa[1])
                for u10, v10, sa in zip(out[u10_col], out[v10_col], slopes_aspects)
            ]
            out["orographic_uplift_note"] = (
                "Wo = V_surface * sin(slope) * cos(wind_from - aspect) (m/s). "
                "Method: Bohrer et al. 2012 / Movebank ENV-DATA. "
                "Positive = updraft on windward slope."
            )
        else:
            out["orographic_uplift_ms"] = np.nan
            out["orographic_uplift_note"] = (
                "WARNING: orographic uplift not computed. "
                "Add surface_u_component_of_wind_10m and "
                "surface_v_component_of_wind_10m as surface variables."
            )

    output_path = _as_path(config.output_csv)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(output_path, index=False, encoding="utf-8-sig", date_format=timestamp_format)

    if config.save_per_individual and config.id_col in out.columns:
        per_dir = output_path.parent / f"{output_path.stem}_by_individual"
        per_dir.mkdir(parents=True, exist_ok=True)

        # Distinct identifiers can sanitise to the same file name
        # ("A/1" and "A 1" -> "A_1"); without a suffix the later group would
        # overwrite the earlier one. Compared case-insensitively because
        # common file systems are case-insensitive.
        used_names = set()
        for ident, group in out.groupby(config.id_col, dropna=False):
            base = re.sub(r"[^\w\-]", "_", str(ident).strip()) or "unknown"
            safe = base
            suffix = 2
            while safe.casefold() in used_names:
                safe = f"{base}_{suffix}"
                suffix += 1
            used_names.add(safe.casefold())

            group.to_csv(
                per_dir / f"{safe}.csv",
                index=False,
                encoding="utf-8-sig",
                date_format=timestamp_format,
            )

    return out
config.surface.categorical, + config.surface.variables, + ) + for var in surface_vars: + self._surface_arrays[var] = _fast_open_3d(ds_surface, var) + + self._component_arrays: List[Tuple[str, OptionalComponentSpec, tuple]] = [] + for label, ds, spec in ( + ("u", ds_u, config.u_component), + ("v", ds_v, config.v_component), + ("w", ds_w, config.w_component), + ("temperature", ds_t, config.temperature_component), + ): + if ds is not None and spec.variable: + self._component_arrays.append((label, spec, _fast_open_4d(ds, spec.variable))) + + def geo_profile(self, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> np.ndarray: + return _fast_sample_4d_profile( + self._geo_arr, self._geo_times, self._geo_lat, self._geo_lon, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + + def var_profile(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> Tuple[np.ndarray, np.ndarray]: + arr, times, levels, lat_vals, lon_vals = self._main_arrays[var] + values = _fast_sample_4d_profile( + arr, times, lat_vals, lon_vals, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + return levels, values + + def component_profile(self, label: str, t: pd.Timestamp, lat: float, lon: float) -> Tuple[np.ndarray, np.ndarray]: + for lbl, _spec, arr_info in self._component_arrays: + if lbl == label: + arr, times, levels, lat_vals, lon_vals = arr_info + values = _fast_sample_4d_profile( + arr, times, lat_vals, lon_vals, + t=t, lat=lat, lon=lon, variable_type="continuous", + ) + return levels, values + raise KeyError(label) + + def surface_value(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> float: + s_arr, s_times, s_lat, s_lon = self._surface_arrays[var] + return _fast_sample_3d_value( + s_arr, s_times, s_lat, s_lon, + t=t, lat=lat, lon=lon, variable_type=vtype, + ) + + @property + def geo_levels(self) -> np.ndarray: + return self._geo_levels + + @property + def component_specs(self) -> List[Tuple[str, OptionalComponentSpec]]: 
+ return [(label, spec) for label, spec, _ in self._component_arrays] + + +class _XarraySampler: + """ + Samples via xarray .sel/.interp on every point. + Used as fallback or for IDW spatial mode. + """ + + def __init__( + self, + ds_geo: xr.Dataset, + ds_main: xr.Dataset, + ds_surface: Optional[xr.Dataset], + ds_u: Optional[xr.Dataset], + ds_v: Optional[xr.Dataset], + ds_w: Optional[xr.Dataset], + ds_t: Optional[xr.Dataset], + config: MultidimAnnotationConfig, + ) -> None: + self._config = config + self._ds_geo = ds_geo + self._ds_main = ds_main + self._ds_surface = ds_surface + self._ds_components: Dict[str, Tuple[xr.Dataset, OptionalComponentSpec]] = {} + for label, ds, spec in ( + ("u", ds_u, config.u_component), + ("v", ds_v, config.v_component), + ("w", ds_w, config.w_component), + ("temperature", ds_t, config.temperature_component), + ): + if ds is not None and spec.variable: + self._ds_components[label] = (ds, spec) + + self._geo_cache: Dict[Tuple[Any, ...], Tuple[np.ndarray, np.ndarray]] = {} + + def geo_profile(self, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> np.ndarray: + _, heights = self._get_geo_cached(t, lat, lon, vtype) + return heights + + def _get_geo_cached(self, t, lat, lon, vtype) -> Tuple[np.ndarray, np.ndarray]: + key = _geopotential_cache_key(t, lat, lon, "nearest" if vtype == "categorical" else "linear", None, None) + if key not in self._geo_cache: + levels, heights = sample_geopotential_profile( + self._ds_geo, + self._config.geopotential_variable, + t=t, lat=lat, lon=lon, + units_override=self._config.geopotential_units, + convert_geopotential_to_height=self._config.convert_geopotential_to_height, + gravity_constant=self._config.gravity_constant, + time_method="nearest" if vtype == "categorical" else "linear", + spatial_method="nearest", + ) + self._geo_cache[key] = (levels, heights) + return self._geo_cache[key] + + def var_profile(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> 
Tuple[np.ndarray, np.ndarray]: + return sample_level_profile( + self._ds_main, var, + t=t, lat=lat, lon=lon, + time_method="nearest" if vtype == "categorical" else "linear", + spatial_method="nearest", + ) + + def component_profile(self, label: str, t: pd.Timestamp, lat: float, lon: float) -> Tuple[np.ndarray, np.ndarray]: + ds, spec = self._ds_components[label] + return sample_level_profile( + ds, spec.variable, + t=t, lat=lat, lon=lon, + time_method="linear", + spatial_method="nearest", + ) + + def surface_value(self, var: str, t: pd.Timestamp, lat: float, lon: float, vtype: VariableType) -> float: + return sample_surface_value( + self._ds_surface, var, + t=t, lat=lat, lon=lon, + variable_type=vtype, + spatial_method=self._config.spatial_method, + ) + + @property + def geo_levels(self) -> np.ndarray: + return np.asarray(self._ds_geo["level"].values) + + @property + def component_specs(self) -> List[Tuple[str, OptionalComponentSpec]]: + return [(label, spec) for label, (_, spec) in self._ds_components.items()] + + def sample_at_height( + self, + var: str, + t: pd.Timestamp, + lat: float, + lon: float, + vtype: VariableType, + target_height: float, + terrain: float, + surface_value: Optional[float], + config: MultidimAnnotationConfig, + ) -> Tuple[float, Dict[str, Any]]: + """IDW spatial sampling: delegates to _sample_multilevel which handles + k-nearest neighbours and inverse distance weighting internally.""" + row = pd.Series({ + config.time_col: t, + config.lat_col: lat, + config.lon_col: lon, + }) + return _sample_multilevel( + self._ds_main, + var, + self._ds_geo, + config.geopotential_variable, + row, + config, + target_height, + terrain, + vtype, + surface_value, + geo_cache=self._geo_cache, + ) + +def _process_single_point( + idx: Any, + row: pd.Series, + sampler: Union[_FastSampler, _XarraySampler], + config: MultidimAnnotationConfig, + out: pd.DataFrame, + main_vars: List[str], + surface_vars: List[str], + surface_anchor_var: Optional[str], +) -> 
Dict[str, Any]: + """ + Annotate one movement point. Writes results into `out` in-place. + Returns diagnostics dict for this point. + """ + warnings_for_point: List[str] = [] + t = pd.Timestamp(row[config.time_col]) + lat = float(row[config.lat_col]) + lon = float(row[config.lon_col]) + + # --- Terrain --- + if config.dem_file: + terrain, dem_warning = sample_dem_elevation(config.dem_file, lat, lon) + if dem_warning and dem_warning != "dem_not_provided": + warnings_for_point.append(dem_warning) + else: + terrain, dem_warning = np.nan, "" + out.at[idx, "terrain_elevation_m"] = terrain + + # --- Height conversion --- + height_msl, hdiag = compute_orthometric_height( + row[config.height_col], lat, lon, + height_reference=config.height_reference, + geoid_mode=config.geoid_mode, + constant_geoid_undulation_m=config.constant_geoid_undulation_m, + geoid_grid_path=config.geoid_grid_path, + terrain_elevation_m=terrain, + ) + out.at[idx, "height_msl_m"] = height_msl + if np.isfinite(height_msl) and np.isfinite(terrain): + out.at[idx, "height_agl_m"] = height_msl - terrain + if hdiag.get("height_conversion_warning"): + warnings_for_point.append(str(hdiag["height_conversion_warning"])) + + row_diag: Dict[str, Any] = {**hdiag, "dem_warning": dem_warning} + + # --- Surface variables --- + surface_values: Dict[str, Any] = {} + for var in surface_vars: + if config.surface is None: + continue + vtype = _var_type(var, config.surface) + try: + sval = sampler.surface_value(var, t, lat, lon, vtype) + out.at[idx, f"surface_{var}"] = sval + surface_values[var] = sval + except Exception as exc: + warnings_for_point.append(f"surface_{var}_failed:{exc}") + + # Geopotential profile + geo_profile_cache: Dict[VariableType, np.ndarray] = {} + + def _get_geo_heights(vtype: VariableType) -> np.ndarray: + if vtype not in geo_profile_cache: + geo_profile_cache[vtype] = sampler.geo_profile(t, lat, lon, vtype) + return geo_profile_cache[vtype] + + # --- Main multilevel variables --- + + for var in 
main_vars: + vtype = _var_type(var, config.multilevel) + try: + anchor = surface_values.get(surface_anchor_var) if ( + surface_anchor_var and vtype == "continuous" and config.use_surface_as_lower_anchor + ) else None + + if hasattr(sampler, "sample_at_height") and config.spatial_method != "nearest": + # IDW: horizontal and vertical sampling together + val, diag = sampler.sample_at_height( + var, t, lat, lon, vtype, height_msl, terrain, anchor, config, + ) + else: + # Nearest: first profile, then vertical interpolation + var_levels, values = sampler.var_profile(var, t, lat, lon, vtype) + geo_heights = _get_geo_heights(vtype) + heights_for_values = _fast_heights_for_variable_levels( + sampler.geo_levels, geo_heights, var_levels, + ) + sh = _surface_height(config, terrain) if anchor is not None else None + val, diag = vertical_sample( + var_levels, heights_for_values, values, height_msl, + method=config.vertical_method, + variable_type=vtype, + surface_value=anchor, + surface_height_m=sh, + allow_extrapolation=config.allow_vertical_extrapolation, + ) + + out.at[idx, f"td_{var}_at_height"] = val + for k, v in diag.items(): + row_diag[f"{var}_{k}"] = v + if diag.get("vertical_warning"): + warnings_for_point.append(f"{var}:{diag['vertical_warning']}") + + except Exception as exc: + warnings_for_point.append(f"{var}_sampling_failed:{exc}") + + # --- Wind/temperature components --- + for label, _spec in sampler.component_specs: + try: + comp_levels, values = sampler.component_profile(label, t, lat, lon) + geo_heights = _get_geo_heights("continuous") + heights_for_values = _fast_heights_for_variable_levels( + sampler.geo_levels, geo_heights, comp_levels, + ) + val, diag = vertical_sample( + comp_levels, heights_for_values, values, height_msl, + method=config.vertical_method, + variable_type="continuous", + allow_extrapolation=config.allow_vertical_extrapolation, + ) + out.at[idx, f"td_{label}_at_height"] = val + if config.keep_diagnostics: + for k, v in diag.items(): + 
def run_multidimensional_annotation(config: MultidimAnnotationConfig) -> pd.DataFrame:
    """Annotate a movement track with multilevel, surface and component data.

    Opens the configured datasets, subsets them to the movement extent,
    samples every point via ``_process_single_point`` and hands the result to
    ``_finalize_and_save_annotation_output``, whose return value is returned.

    ``config`` is normalised in place: ``vertical_method`` and
    ``spatial_method`` are replaced by their normalised forms, and
    ``smoothing_k`` is forced to 1 for nearest-neighbour sampling.

    Every dataset opened here is closed before returning, including when
    opening a later dataset or any processing step fails.
    """
    config.vertical_method = _normalize_vertical_method(config.vertical_method)  # type: ignore[assignment]
    config.spatial_method = _normalize_spatial_method(config.spatial_method)  # type: ignore[assignment]
    if config.spatial_method == "nearest":
        config.smoothing_k = 1

    # Track datasets as they are opened so a failure half-way through the
    # opening sequence still closes the ones already open.
    opened: List[Any] = []

    def _open(path: Any) -> Any:
        ds = open_dataset(path, config.coord_spec)
        opened.append(ds)
        return ds

    def _xarray_sampler() -> _XarraySampler:
        return _XarraySampler(
            ds_geo, ds_main, ds_surface,
            ds_u, ds_v, ds_w, ds_t, config,
        )

    try:
        ds_geo = _open(config.geopotential_file)
        ds_main = _open(config.multilevel.path)
        ds_surface = _open(config.surface.path) if config.surface else None
        ds_u = _open(config.u_component.path) if config.u_component.is_enabled() else None
        ds_v = _open(config.v_component.path) if config.v_component.is_enabled() else None
        ds_w = _open(config.w_component.path) if config.w_component.is_enabled() else None
        ds_t = _open(config.temperature_component.path) if config.temperature_component.is_enabled() else None

        movement = _load_movement(config, ds_main)
        movement = _prefilter_time(movement, config, [ds_geo, ds_main])

        ds_geo = subset_dataset_to_movement(ds_geo, movement, config)
        ds_main = subset_dataset_to_movement(ds_main, movement, config)
        ds_surface = subset_dataset_to_movement(ds_surface, movement, config) if ds_surface is not None else None
        ds_u = subset_dataset_to_movement(ds_u, movement, config) if ds_u is not None else None
        ds_v = subset_dataset_to_movement(ds_v, movement, config) if ds_v is not None else None
        ds_w = subset_dataset_to_movement(ds_w, movement, config) if ds_w is not None else None
        ds_t = subset_dataset_to_movement(ds_t, movement, config) if ds_t is not None else None

        main_vars = _unique(
            config.multilevel.continuous,
            config.multilevel.categorical,
            config.multilevel.variables,
        )
        surface_vars = _unique(
            config.surface.continuous,
            config.surface.categorical,
            config.surface.variables,
        ) if config.surface else []
        # The first continuous surface variable is the candidate lower anchor.
        surface_anchor_var = (
            config.surface.continuous[0]
            if (config.surface and config.surface.continuous)
            else None
        )

        if config.spatial_method == "nearest":
            try:
                sampler = _FastSampler(
                    ds_geo, ds_main, ds_surface,
                    ds_u, ds_v, ds_w, ds_t, config,
                )
            except Exception as exc:
                LOGGER.warning(
                    "FastSampler init failed (%s), falling back to XarraySampler.",
                    exc,
                    exc_info=True,
                )
                sampler = _xarray_sampler()
        else:
            sampler = _xarray_sampler()

        # Pre-create every output column so per-point writes only fill cells.
        out = movement.copy()
        for col in ("terrain_elevation_m", "height_msl_m", "height_agl_m"):
            out[col] = np.nan
        out["annotation_warning"] = ""
        for var in main_vars:
            out[f"td_{var}_at_height"] = np.nan
        for var in surface_vars:
            out[f"surface_{var}"] = np.nan
        for label, _spec in sampler.component_specs:
            out[f"td_{label}_at_height"] = np.nan

        diag_rows: List[Dict[str, Any]] = []
        for idx, row in out.iterrows():
            row_diag = _process_single_point(
                idx, row, sampler, config, out,
                main_vars, surface_vars, surface_anchor_var,
            )
            diag_rows.append(row_diag)

        if config.keep_diagnostics and diag_rows:
            diag_df = pd.DataFrame(diag_rows, index=out.index)
            for col in diag_df.columns:
                if col not in out.columns:
                    out[col] = diag_df[col]

        return _finalize_and_save_annotation_output(out, config)
    finally:
        # Best-effort cleanup: a failing close must not mask the real result
        # or error, but it is recorded instead of being dropped silently.
        for ds in opened:
            try:
                if ds is not None:
                    ds.close()
            except Exception:
                LOGGER.debug("Failed to close dataset during cleanup.", exc_info=True)
+ + +def _list_or_empty(values: Optional[Sequence[str]]) -> List[str]: + return list(values or []) + + +def run_multidimensional_annotation_from_paths( + *, + movement_csv: Union[str, Path], + output_csv: Union[str, Path], + id_col: str, + time_col: str, + lat_col: str, + lon_col: str, + height_col: str, + geopotential_file: Union[str, Path], + geopotential_variable: str, + multilevel_var_file: Union[str, Path], + multilevel_variable: Optional[str] = None, + multilevel_continuous_vars: Optional[Sequence[str]] = None, + multilevel_categorical_vars: Optional[Sequence[str]] = None, + surface_var_file: Optional[Union[str, Path]] = None, + surface_variable: Optional[str] = None, + surface_continuous_vars: Optional[Sequence[str]] = None, + surface_categorical_vars: Optional[Sequence[str]] = None, + selected_ids: Optional[Sequence[str]] = None, + boundary_path: Optional[Union[str, Path]] = None, + bbox: Optional[Dict[str, float]] = None, + coord_spec: Optional[Dict[str, Optional[str]]] = None, + nc_time_var: Optional[str] = None, + nc_lat_var: Optional[str] = None, + nc_lon_var: Optional[str] = None, + nc_level_var: Optional[str] = None, + spatial_interpolation_method: str = "Nearest neighbor", + smoothing_k: int = 1, + vertical_matching_method: str = "Nearest geopotential-height level", + geopotential_units: Optional[str] = "m2 s-2", + convert_geopotential_to_height: bool = True, + use_surface_as_lower_anchor: bool = True, + surface_anchor_height_agl_m: float = 2.0, + dem_file: Optional[Union[str, Path]] = None, + save_per_individual: bool = False, + keep_diagnostics: bool = True, + height_reference: HeightReference = "ellipsoidal", + geoid_mode: GeoidMode = "geographiclib", + constant_geoid_undulation_m: float = 0.0, + geoid_grid_path: Optional[Union[str, Path]] = None, + u_file: Optional[Union[str, Path]] = None, + u_variable: Optional[str] = None, + v_file: Optional[Union[str, Path]] = None, + v_variable: Optional[str] = None, + w_file: Optional[Union[str, Path]] = 
None, + w_variable: Optional[str] = None, + temperature_file: Optional[Union[str, Path]] = None, + temperature_variable: Optional[str] = None, + derive_wind_speed_direction: bool = False, + derive_wind_support_crosswind: bool = False, + derive_vertical_motion: bool = False, + derive_thermal_proxy: bool = False, + derive_orographic_uplift: bool = False, + heading_col: Optional[str] = None, + heading_source: Literal["compute", "column"] = "compute", +) -> pd.DataFrame: + if coord_spec is None: + coord_spec = {"time": nc_time_var, "lat": nc_lat_var, "lon": nc_lon_var, "level": nc_level_var} + coord_spec = {k: v for k, v in (coord_spec or {}).items() if v} + + ml_cont = _list_or_empty(multilevel_continuous_vars) + ml_cat = _list_or_empty(multilevel_categorical_vars) + if not ml_cont and not ml_cat and multilevel_variable: + ml_cont = [multilevel_variable] + ml_vars = _unique(ml_cont, ml_cat) + if not ml_vars: + raise ValueError("No multilevel variables selected.") + + surf_cont = _list_or_empty(surface_continuous_vars) + surf_cat = _list_or_empty(surface_categorical_vars) + if not surf_cont and not surf_cat and surface_variable: + surf_cont = [surface_variable] + surf_vars = _unique(surf_cont, surf_cat) + + surface = None + if surface_var_file and surf_vars: + surface = DatasetSpec(surface_var_file, variables=surf_vars, continuous=surf_cont, categorical=surf_cat, label_prefix="surface") + + config = MultidimAnnotationConfig( + movement_csv=movement_csv, + output_csv=output_csv, + id_col=id_col, + time_col=time_col, + lat_col=lat_col, + lon_col=lon_col, + height_col=height_col, + selected_ids=list(selected_ids or []) if selected_ids is not None else None, + boundary_path=boundary_path, + bbox=bbox, + coord_spec=coord_spec, + geopotential_file=geopotential_file, + geopotential_variable=geopotential_variable, + geopotential_units=geopotential_units, + convert_geopotential_to_height=convert_geopotential_to_height, + multilevel=DatasetSpec(multilevel_var_file, 
def sample_era5_at_height(*args: Any, **kwargs: Any) -> pd.DataFrame:
    """Placeholder entry point: accepts any arguments and always raises NotImplementedError.

    The error message names ``run_multidimensional_annotation_from_paths()``
    as the function to call instead.
    """
    message = "Use run_multidimensional_annotation_from_paths() instead."
    raise NotImplementedError(message)
"add_wind_metrics", + "run_multidimensional_annotation", + "run_multidimensional_annotation_from_paths", + "run_three_dim_annotation", +] \ No newline at end of file diff --git a/ecodata/nc_builder_functions.py b/ecodata/nc_builder_functions.py new file mode 100644 index 0000000..bdc5d7d --- /dev/null +++ b/ecodata/nc_builder_functions.py @@ -0,0 +1,930 @@ +""" +Backend functions for NCBuilder_App. + +This module is intentionally UI-free: +- no Panel imports +- no ECODATA template imports +- no register_view imports + +It can be imported safely from ecodata.__init__ or from nc_builder_app.py. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple +import json +import re + +import numpy as np +import pandas as pd +import xarray as xr + + +NETCDF_EXTENSIONS = (".nc", ".nc4", ".cdf", ".netcdf") + + +@dataclass +class NCBuildConfig: + files: List[str] + combine_mode: str + target_variable: str + output_variable_name: str + lat_variable: str + lon_variable: str + # Optional multi-variable mode + target_variables: Optional[List[str]] = None + time_source: str = "From NetCDF time coordinate" + time_variable: Optional[str] = None + time_regex: str = r"(\d{8})" + time_format: str = "%Y%m%d" + time_table_path: Optional[str] = None + level_source: str = "From NetCDF coordinate" + level_variable: Optional[str] = None + level_regex: str = r"level(\d+)" + level_table_path: Optional[str] = None + output_level_coord_name: str = "level" + level_units: str = "hPa" + + bbox: Optional[Dict[str, float]] = None + start_time: Optional[str] = None + end_time: Optional[str] = None + + output_path: str = "standardized_output.nc" + use_dask_chunks: bool = True + chunking_mode: str = "auto" + manual_chunks: Optional[Dict[str, int]] = None + enable_compression: bool = True + convert_longitude_to_180: bool = True + # "auto" means: xarray default -> h5netcdf -> netcdf4 -> 
scipy. + open_engine: str = "auto" + #! for ECODATA/MATLAB compatibility + use_modis_time_encoding: bool = True + + +def list_netcdf_files(folder: str | Path) -> List[Path]: + folder = Path(folder).expanduser() + if not folder.exists() or not folder.is_dir(): + return [] + + allowed = {ext.lower() for ext in NETCDF_EXTENSIONS} + + files = [ + p for p in folder.iterdir() + if p.is_file() and p.suffix.lower() in allowed + ] + + return sorted(files, key=lambda p: p.name.lower()) + + +def _guess_name(candidates: Sequence[str], preferred: Sequence[str]) -> Optional[str]: + if not candidates: + return None + lower_map = {str(c).lower(): str(c) for c in candidates} + for p in preferred: + if p.lower() in lower_map: + return lower_map[p.lower()] + for c in candidates: + cl = str(c).lower() + if any(p.lower() in cl for p in preferred): + return str(c) + return None + + +def _safe_open_for_scan( + path: str | Path, + preferred_engine: Optional[str] = None, +) -> Tuple[xr.Dataset, str]: + """ + Open dataset for metadata scanning. + + For scanning, avoid chunks="auto" because it can fail when dask is not installed + and is unnecessary for reading names, dimensions, and coordinate ranges. 
+ """ + return _open_dataset_auto( + path, + {"decode_times": True}, + preferred_engine=preferred_engine, + ) + + +def scan_netcdf_files( + files: Sequence[str | Path], + max_scan: int = 10, + use_dask_chunks: bool = False, + chunking_mode: str = "auto", + manual_chunks: Optional[Dict[str, int]] = None, +) -> Dict[str, Any]: + existing = [Path(f).expanduser() for f in files if Path(f).expanduser().exists()] + warnings: List[str] = [] + engine_by_file: Dict[str, str] = {} + + if not existing: + return { + "files": [], + "variables": [], + "coords": [], + "dims": [], + "all_names": [], + "suggested_time": None, + "suggested_lat": None, + "suggested_lon": None, + "suggested_level": None, + "time_min": None, + "time_max": None, + "scanned_count": 0, + "engine_by_file": {}, + "warnings": ["No existing NetCDF files were found."], + } + + variables = set() + coords = set() + dims = set() + time_min = None + time_max = None + + scanned_count = 0 + for path in existing[:max_scan]: + try: + ds, engine_used = _safe_open_for_scan(path) + engine_by_file[path.name] = engine_used + with ds: + scanned_count += 1 + variables.update(map(str, ds.data_vars)) + coords.update(map(str, ds.coords)) + dims.update(map(str, ds.dims)) + + all_vars = list(map(str, ds.variables)) + tname = _guess_name( + all_vars, + ["time", "valid_time", "forecast_time", "verification_time", "datetime", "date", "t", "Time"], + ) + if tname and tname in ds.variables: + cur_min, cur_max, calendar_info = _safe_time_range(ds[tname].values) + if cur_min is not None and cur_max is not None: + if time_min is None or str(cur_min) < str(time_min): + time_min = cur_min + if time_max is None or str(cur_max) > str(time_max): + time_max = cur_max + + if calendar_info: + warnings.append( + f"Time in {path.name} uses non-pandas calendar/time type " + f"`{calendar_info}`; preview time range is shown as string." 
def _open_dataset_auto(
    path: str | Path,
    open_kwargs: Optional[Dict[str, Any]] = None,
    preferred_engine: Optional[str] = None,
) -> Tuple[xr.Dataset, str]:
    """
    Open a NetCDF file with automatic engine fallback.

    Engine strategy:
    - preferred_engine, if explicitly provided and not "auto"/"default";
    - xarray default engine;
    - h5netcdf;
    - netcdf4;
    - scipy.

    Each engine is tried at most once. Any ``engine`` entry in ``open_kwargs``
    is ignored because the engine is chosen here.

    Returns
    -------
    ds : xr.Dataset
        Opened dataset.
    engine_used : str
        Engine name used for opening. "default" means xarray default engine.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist. This is a subclass of ``OSError``.
    OSError
        If every engine failed. The last engine error is attached as the
        cause so the original traceback is not lost.
    """
    path = Path(path).expanduser()

    # A missing file fails identically with every engine; report it once
    # instead of cycling through all of them.
    if not path.exists():
        raise FileNotFoundError(f"NetCDF file not found: {path}")

    base_kwargs = dict(open_kwargs or {})
    base_kwargs.pop("engine", None)

    engines: List[Optional[str]] = []
    if preferred_engine not in (None, "", "auto", "default"):
        engines.append(str(preferred_engine))
    engines.extend([None, "h5netcdf", "netcdf4", "scipy"])

    tried: List[str] = []
    last_exc: Optional[Exception] = None

    for engine in engines:
        engine_label = engine or "default"
        if engine_label in tried:
            continue
        tried.append(engine_label)

        kwargs = dict(base_kwargs)
        if engine is not None:
            kwargs["engine"] = engine

        try:
            return xr.open_dataset(path, **kwargs), engine_label
        except Exception as exc:
            # Deliberately broad: a missing backend, an unsupported format and
            # a corrupt file raise different exception types per engine.
            last_exc = exc

    raise OSError(
        f"Could not open NetCDF file {path.name!r}. "
        f"Tried engines: {tried}. Last error: {last_exc}"
    ) from last_exc
+ if len(selected_vars) == 1 and not config.output_variable_name: + errors.append("Output variable name is empty.") + + if not config.lat_variable: + errors.append("Latitude variable is not selected.") + + if not config.lon_variable: + errors.append("Longitude variable is not selected.") + + if config.time_source == "From NetCDF time coordinate" and not config.time_variable: + errors.append("Time source is NetCDF coordinate, but no time variable is selected.") + + if config.time_source == "From filename": + if not config.time_regex: + errors.append("Time source is filename, but time regex is empty.") + if not config.time_format: + errors.append("Time source is filename, but time format is empty.") + + if config.time_source == "Manual table" and not config.time_table_path: + errors.append("Time source is manual table, but no time table file is selected.") + + if config.combine_mode in ("By level", "By time and level"): + if config.level_source == "From NetCDF coordinate" and not config.level_variable: + errors.append("Combine mode requires level handling, but no level variable is selected.") + if config.level_source == "From filename" and not config.level_regex: + errors.append("Level source is filename, but level regex is empty.") + if config.level_source == "Manual table" and not config.level_table_path: + errors.append("Level source is manual table, but no level table file is selected.") + + if config.bbox is not None: + try: + south = float(config.bbox["south"]) + north = float(config.bbox["north"]) + west = float(config.bbox["west"]) + east = float(config.bbox["east"]) + if south >= north: + errors.append("Bounding box is invalid: South must be smaller than North.") + if west >= east: + errors.append("Bounding box is invalid: West must be smaller than East.") + except Exception: + errors.append("Bounding box is enabled but contains invalid values.") + + output_path = Path(config.output_path).expanduser() + if not output_path.name: + errors.append("Output 
filename is empty.") + if output_path.suffix.lower() not in (".nc", ".nc4"): + warnings.append("Output file does not end with .nc or .nc4.") + + valid_engines = {"auto", "default", "h5netcdf", "netcdf4", "scipy"} + if config.open_engine not in valid_engines: + errors.append( + f"Invalid open_engine={config.open_engine!r}. " + f"Expected one of: {sorted(valid_engines)}." + ) + + if config.open_engine == "auto": + warnings.append( + "NetCDF open engine will be selected automatically: default -> h5netcdf -> netcdf4 -> scipy." + ) + else: + warnings.append(f"NetCDF open engine preference: {config.open_engine}.") + + if config.convert_longitude_to_180: + warnings.append("Longitudes will be converted to the -180..180 convention when possible.") + if config.use_modis_time_encoding: + warnings.append("Time coordinate will use MATLAB/MODIS-compatible encoding when possible.") + return len(errors) == 0, errors, warnings + + +def _load_lookup_table(path: Optional[str], value_col: str) -> Dict[str, Any]: + if not path: + return {} + table_path = Path(path).expanduser() + if not table_path.exists(): + raise FileNotFoundError(f"Manual table not found: {table_path}") + + df = pd.read_csv(table_path) + if "name" not in df.columns or value_col not in df.columns: + raise ValueError(f"Manual table must contain columns: name, {value_col}") + + return {str(row["name"]): row[value_col] for _, row in df.iterrows()} + + +def _lookup_by_filename(path: Path, lookup: Dict[str, Any]) -> Optional[Any]: + if not lookup: + return None + name = path.name + for key, value in lookup.items(): + if str(key) == name or str(key) in name: + return value + return None + + +def _parse_from_filename(path: Path, regex: str, cast=float, time_format: Optional[str] = None) -> Any: + m = re.search(regex, path.name) + if not m: + raise ValueError(f"Pattern {regex!r} did not match file name {path.name!r}") + token = m.group(1) if m.groups() else m.group(0) + if time_format: + return pd.to_datetime(token, 
format=time_format) + return cast(token) + + +def _rename_if_needed(ds: xr.Dataset, old: Optional[str], new: str) -> xr.Dataset: + if not old or old == "None": + return ds + if old == new: + return ds + if old in ds.variables or old in ds.dims or old in ds.coords: + return ds.rename({old: new}) + return ds + +def _normalize_time_coord_if_possible(ds: xr.Dataset) -> xr.Dataset: + """ + Convert the time coordinate to pandas datetime only when this is safe. + + Standard calendars are usually convertible to pandas datetime64. + Non-standard calendars, such as julian, noleap, or 360_day, may be decoded + by xarray as cftime objects. pandas.to_datetime() cannot convert them + reliably, so they are preserved unchanged. + """ + if "time" not in ds.coords: + return ds + + values = ds["time"].values + + try: + converted = pd.to_datetime(values) + except Exception: + return ds + + return ds.assign_coords(time=converted) + +def _safe_time_range(values): + """ + Return time min/max for preview without failing on cftime calendars. + + pandas can handle standard datetime-like values, but not all cftime calendars + such as Julian, noleap, or 360_day. For cftime objects, use native min/max + and convert to string for display. + """ + if values is None or len(values) == 0: + return None, None, None + + try: + vals = pd.to_datetime(values) + if len(vals) == 0: + return None, None, None + return pd.Timestamp(vals.min()), pd.Timestamp(vals.max()), None + except Exception: + try: + cur_min = min(values) + cur_max = max(values) + calendar_type = type(values[0]).__name__ + return cur_min, cur_max, calendar_type + except Exception as exc: + return None, None, f"unreadable time values: {exc}" + +def _subset_bbox_1d_coords( + ds: xr.Dataset, + bbox: Dict[str, float], + source_name: str = "dataset", +) -> xr.Dataset: + """ + Subset a dataset by bbox using 1D lat/lon coordinate values. + + This does not require lat/lon to be xarray index coordinates. 
+ It works when lat and lon are 1D coordinates, even if their dimension names + are not exactly 'lat' and 'lon'. + """ + if "lat" not in ds.coords or "lon" not in ds.coords: + return ds + + lat = ds["lat"] + lon = ds["lon"] + + if lat.ndim != 1 or lon.ndim != 1: + raise ValueError( + f"Bounding box subset currently supports only 1D lat/lon coordinates in {source_name}. " + f"Got lat.ndim={lat.ndim}, lon.ndim={lon.ndim}." + ) + + south = float(bbox["south"]) + north = float(bbox["north"]) + west = float(bbox["west"]) + east = float(bbox["east"]) + + lat_dim = lat.dims[0] + lon_dim = lon.dims[0] + + lat_values = np.asarray(lat.values) + lon_values = np.asarray(lon.values) + + lat_mask = (lat_values >= south) & (lat_values <= north) + lon_mask = (lon_values >= west) & (lon_values <= east) + + lat_idx = np.where(lat_mask)[0] + lon_idx = np.where(lon_mask)[0] + + if lat_idx.size == 0: + raise ValueError( + f"Bounding box produced an empty latitude subset in {source_name}. " + f"Requested south/north=({south}, {north}); " + f"available lat range=({float(np.nanmin(lat_values))}, {float(np.nanmax(lat_values))})." + ) + + if lon_idx.size == 0: + raise ValueError( + f"Bounding box produced an empty longitude subset in {source_name}. " + f"Requested west/east=({west}, {east}); " + f"available lon range=({float(np.nanmin(lon_values))}, {float(np.nanmax(lon_values))})." + ) + + return ds.isel({ + lat_dim: lat_idx, + lon_dim: lon_idx, + }) + +def _json_safe(value: Any) -> Any: + """ + Convert common numpy/pandas/xarray/cftime objects to JSON-safe values. + + This is mainly used for writing the manifest file. NetCDF encodings may + contain numpy dtypes or other objects that json.dump cannot serialize. 
+ """ + if value is None: + return None + + if isinstance(value, (str, int, float, bool)): + return value + + if isinstance(value, Path): + return str(value) + + if isinstance(value, dict): + return {str(k): _json_safe(v) for k, v in value.items()} + + if isinstance(value, (list, tuple, set)): + return [_json_safe(v) for v in value] + + if isinstance(value, np.generic): + return value.item() + + if isinstance(value, np.dtype): + return str(value) + + if isinstance(value, pd.Timestamp): + return value.isoformat() + + # Handles pandas/numpy extension dtypes such as Float64DType. + if hasattr(value, "name") and value.__class__.__name__.endswith("DType"): + return str(value) + + # Handles cftime objects and any remaining non-JSON-native objects. + return str(value) + +def _selected_target_variables(config: NCBuildConfig) -> List[str]: + """ + Return target variables selected for output. + + New multi-variable mode uses config.target_variables. + Legacy single-variable mode uses config.target_variable. + """ + selected = [ + str(v) for v in (config.target_variables or []) + if v not in (None, "", "None") + ] + + if selected: + # Preserve order and remove duplicates. + unique: List[str] = [] + seen = set() + for v in selected: + if v not in seen: + seen.add(v) + unique.append(v) + return unique + + if config.target_variable not in (None, "", "None"): + return [str(config.target_variable)] + + return [] + +def _standardize_one_dataset( + path: Path, + config: NCBuildConfig, + time_lookup: Dict[str, Any], + level_lookup: Dict[str, Any], +) -> xr.Dataset: + open_kwargs = {"decode_times": True} + if config.use_dask_chunks: + if config.chunking_mode == "auto": + open_kwargs["chunks"] = "auto" + elif config.manual_chunks: + # Only keep positive chunks; xarray will ignore unknown dims poorly, + # so this is applied later after dims are known if needed. 
+ open_kwargs["chunks"] = {k: int(v) for k, v in config.manual_chunks.items() if int(v) > 0} + + try: + ds, engine_used = _open_dataset_auto( + path, + open_kwargs, + preferred_engine=config.open_engine, + ) + except Exception: + # Fallback without dask/chunks. + ds, engine_used = _open_dataset_auto( + path, + {"decode_times": True}, + preferred_engine=config.open_engine, + ) + + selected_vars = _selected_target_variables(config) + missing_vars = [ + var for var in selected_vars + if var not in ds.data_vars and var not in ds.variables + ] + + if missing_vars: + ds.close() + raise ValueError( + f"Target variable(s) {missing_vars!r} not found in {path.name}" + ) + + # Rename coordinates/dims to ECODATA/CF-style names. + ds = _rename_if_needed(ds, config.lat_variable, "lat") + ds = _rename_if_needed(ds, config.lon_variable, "lon") + + if config.time_source == "From NetCDF time coordinate": + ds = _rename_if_needed(ds, config.time_variable, "time") + + if config.level_source == "From NetCDF coordinate" and config.level_variable not in (None, "", "None"): + ds = _rename_if_needed(ds, config.level_variable, "level") + + # Build output dataset. + # Single-variable mode preserves the output_variable_name behaviour. + # Multi-variable mode keeps original variable names to avoid ambiguous renaming. + if len(selected_vars) == 1: + old_name = selected_vars[0] + new_name = config.output_variable_name or old_name + + da = ds[old_name] + if new_name != old_name: + da = da.rename(new_name) + + out = da.to_dataset() + else: + out = ds[selected_vars].copy() + + out.attrs["source_open_engine"] = engine_used + out.attrs["source_file"] = str(path) + + # Add time if it comes from filename/table and is not already a dimension. 
+ if config.time_source == "From filename": + t = _parse_from_filename(path, config.time_regex, time_format=config.time_format) + if "time" not in out.dims: + out = out.expand_dims(time=[pd.Timestamp(t)]) + else: + out = out.assign_coords(time=pd.to_datetime(out["time"].values)) + elif config.time_source == "Manual table": + value = _lookup_by_filename(path, time_lookup) + if value is None: + raise ValueError(f"No DateTime entry found in time table for {path.name}") + t = pd.to_datetime(value) + if "time" not in out.dims: + out = out.expand_dims(time=[pd.Timestamp(t)]) + elif "time" in out.coords: + out = _normalize_time_coord_if_possible(out) + + # Add level if it comes from filename/table and is not already a dimension. + if config.level_source == "From filename": + level_value = _parse_from_filename(path, config.level_regex, cast=float) + if "level" not in out.dims: + out = out.expand_dims(level=[level_value]) + elif config.level_source == "Manual table": + value = _lookup_by_filename(path, level_lookup) + if value is None: + raise ValueError(f"No level entry found in level table for {path.name}") + level_value = float(value) + if "level" not in out.dims: + out = out.expand_dims(level=[level_value]) + + # Keep only expected data + standard coords where possible. + if "lat" not in out.variables and "lat" not in out.coords: + raise ValueError(f"Could not standardize latitude coordinate in {path.name}") + if "lon" not in out.variables and "lon" not in out.coords: + raise ValueError(f"Could not standardize longitude coordinate in {path.name}") + + # Convert lon 0..360 to -180..180 when lon is 1D. + if config.convert_longitude_to_180 and "lon" in out.coords: + lon = out["lon"] + try: + if lon.ndim == 1 and float(lon.max()) > 180: + new_lon = ((lon + 180) % 360) - 180 + out = out.assign_coords(lon=new_lon).sortby("lon") + except Exception: + pass + + # Sort common dims. 
def _check_grid_compatibility(datasets: Sequence[xr.Dataset]) -> None:
    """
    Verify that all datasets share the same ``lat``/``lon`` grid.

    The first dataset is the reference. For each of ``lat`` and ``lon`` that
    the reference has as a coordinate, every other dataset must have the same
    coordinate with the same shape and numerically close values (NaNs in
    matching positions count as equal). A coordinate missing from the
    reference is not checked.

    Raises
    ------
    ValueError
        If ``datasets`` is empty, a dataset lacks a coordinate the reference
        has, or a coordinate differs in shape or in values. Shape and value
        mismatches get separate messages; datasets are numbered from 1.
    """
    if not datasets:
        raise ValueError("No datasets to combine.")

    ref = datasets[0]
    for coord in ("lat", "lon"):
        if coord not in ref.coords:
            continue
        ref_vals = ref[coord].values
        for i, ds in enumerate(datasets[1:], start=2):
            if coord not in ds.coords:
                raise ValueError(f"Dataset #{i} is missing coordinate {coord!r}")
            vals = ds[coord].values

            if ref_vals.shape != vals.shape:
                raise ValueError(
                    f"Grid incompatibility for coordinate {coord!r}: "
                    f"dataset #1 shape {ref_vals.shape}, dataset #{i} shape {vals.shape}"
                )

            # Reporting only the shapes here would print two identical shapes
            # and hide the real cause.
            if not np.allclose(ref_vals, vals, equal_nan=True):
                raise ValueError(
                    f"Grid incompatibility for coordinate {coord!r}: "
                    f"dataset #1 and dataset #{i} have the same shape {vals.shape} "
                    f"but different coordinate values"
                )
def _apply_cf_metadata(ds: xr.Dataset, config: NCBuildConfig) -> xr.Dataset:
    """
    Attach CF-style attributes to the standard coordinates and to the dataset.

    Coordinate attributes are set only for coordinates that exist:
    ``lat`` (axis Y), ``lon`` (axis X), ``time`` (axis T) and ``level``
    (axis Z). For ``level``, ``standard_name`` and ``positive`` are added when
    ``config.level_units`` is ``"hPa"``/``"Pa"`` (air pressure, down) or
    ``"m"`` (height, up). Global attributes record the title, conventions,
    history, number of configured source files and combine mode.

    Attributes are updated in place on ``ds``; the same dataset is returned.
    """
    if "lat" in ds.coords:
        ds["lat"].attrs.update({
            "standard_name": "latitude",
            "long_name": "latitude",
            "units": "degrees_north",
            "axis": "Y",
        })

    if "lon" in ds.coords:
        ds["lon"].attrs.update({
            "standard_name": "longitude",
            "long_name": "longitude",
            "units": "degrees_east",
            "axis": "X",
        })

    if "time" in ds.coords:
        ds["time"].attrs.update({
            "standard_name": "time",
            "long_name": "time",
            "axis": "T",
        })

    if "level" in ds.coords:
        level_attrs = {
            "long_name": "vertical level",
            "axis": "Z",
        }
        # Only write units when they are set.
        # NOTE(review): a None attribute value is expected to be rejected when
        # xarray serializes to NetCDF — confirm against the xarray version in use.
        if config.level_units:
            level_attrs["units"] = config.level_units

        if config.level_units in ("hPa", "Pa"):
            level_attrs.update({
                "standard_name": "air_pressure",
                "positive": "down",
            })
        elif config.level_units == "m":
            level_attrs.update({
                "standard_name": "height",
                "positive": "up",
            })
        ds["level"].attrs.update(level_attrs)

    ds.attrs.update({
        "title": "ECODATA standardized NetCDF",
        "Conventions": "CF-1.8",
        "history": "Created by ECODATA NCBuilder",
        "source_files_count": len(config.files),
        "combine_mode": config.combine_mode,
    })

    return ds
def build_standardized_netcdf(config: NCBuildConfig) -> Dict[str, Any]:
    """
    Build one standardized NetCDF file from the configured source files.

    Steps:
    1. validate the configuration;
    2. load the manual time/level tables when those sources are selected;
    3. standardize every existing source file (missing files are skipped);
    4. combine the datasets, apply CF metadata and the optional time encoding;
    5. write the NetCDF file and a ``<output>.manifest.json`` file next to it.

    All opened source datasets and the combined dataset are closed before
    returning, including when a step fails.

    Returns
    -------
    The manifest dictionary that was written to the manifest file.

    Raises
    ------
    ValueError
        If the configuration is invalid or no source file could be processed.
        Errors from standardizing, combining or writing propagate unchanged.
    """
    ok, errors, config_warnings = validate_build_config(config)
    if not ok:
        raise ValueError("Invalid NCBuildConfig: " + "; ".join(errors))

    output_path = Path(config.output_path).expanduser()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    time_lookup = _load_lookup_table(config.time_table_path, "DateTime") if config.time_source == "Manual table" else {}
    level_lookup = _load_lookup_table(config.level_table_path, "level") if config.level_source == "Manual table" else {}

    datasets: List[xr.Dataset] = []
    processed_files: List[str] = []
    combined = None

    try:
        for f in config.files:
            path = Path(f).expanduser()
            if not path.exists():
                continue
            ds = _standardize_one_dataset(path, config, time_lookup, level_lookup)
            datasets.append(ds)
            processed_files.append(str(path))

        if not datasets:
            raise ValueError("No datasets were successfully opened.")

        combined = _combine_datasets(datasets, config)
        combined = _apply_cf_metadata(combined, config)
        combined = _apply_time_encoding(combined, config)

        encoding = _encoding_for(combined, config)
        combined.to_netcdf(output_path, encoding=encoding)

        engine_by_file = {}
        for ds in datasets:
            source_file = ds.attrs.get("source_file")
            source_engine = ds.attrs.get("source_open_engine")
            if source_file and source_engine:
                engine_by_file[Path(source_file).name] = source_engine

        manifest_path = output_path.with_suffix(output_path.suffix + ".manifest.json")
        manifest = {
            "output_path": str(output_path),
            "manifest_path": str(manifest_path),
            "processed_files": _json_safe(processed_files),
            "engine_by_file": _json_safe(engine_by_file),
            "config": _json_safe(asdict(config)),
            "warnings": _json_safe(config_warnings),
            "output_dims": _json_safe({k: int(v) for k, v in combined.sizes.items()}),
            "output_variables": _json_safe(list(map(str, combined.data_vars))),
            "output_coords": _json_safe(list(map(str, combined.coords))),
            "time_encoding": _json_safe(dict(combined["time"].encoding)) if "time" in combined.coords else {},
        }

        with open(manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False, default=str)
    finally:
        # Release file handles on success and on failure alike. Closing is
        # best-effort: a failure to close must not mask the real error.
        for ds in datasets:
            try:
                ds.close()
            except Exception:
                pass
        if combined is not None:
            try:
                combined.close()
            except Exception:
                pass

    return manifest
@dataclass
class VettingOptions:
    """
    Quality filters applied to merged eBird EBD + Sampling Event records.

    Column-based filters are only applied when the corresponding column is
    present in the data. Each field below is annotated with the column or
    behaviour it controls.
    """

    # Keep only rows whose REVIEWED column is truthy.
    require_reviewed: bool = False
    # Keep only rows whose APPROVED column is truthy.
    require_approved: bool = False
    # Keep only rows whose ALL SPECIES REPORTED column is truthy.
    require_all_species_reported: bool = False

    # Protocols to keep; matched against PROTOCOL TYPE (preferred) or
    # PROTOCOL CODE if present. None disables protocol filtering.
    allowed_protocols: Optional[List[str]] = None
    # Drop Incidental/Historical records when PROTOCOL TYPE is present.
    exclude_incidental_historical: bool = True

    # Duration bounds, applied when sampling effort fields are present.
    duration_min_minutes: int = 0
    duration_max_minutes: int = 600

    # Distance bounds, applied when sampling effort fields are present.
    distance_min_km: float = 0.0
    distance_max_km: float = 50.0

    # Remove rows with missing/invalid lat/lon.
    require_valid_coords: bool = True

    # Clip numeric counts after parsing; 0 disables clipping.
    clip_counts_above: int = 0
+ + Spatial aggregation: + - grid_step_deg == 0: keep original observation coordinates + - grid_step_deg > 0: assign observations to regular lon/lat grid nodes + + Notes: + - treat_x_as_one: if True, OBSERVATION COUNT == 'X' is treated as 1. + If False, 'X' is treated as missing and then filled to 1.0 for presence-like behavior. + """ + + start_date: dt.date + end_date: dt.date + step_days: int = 7 + grid_step_deg: float = 0.0 + treat_x_as_one: bool = True + + +def _truthy(series: pd.Series) -> pd.Series: + """ + Interpret typical eBird truthy values. + """ + s = series.fillna("").astype(str).str.strip().str.upper() + return s.isin(["1", "TRUE", "T", "YES", "Y"]) + + +def _read_bytes_table(file_bytes: bytes) -> pd.DataFrame: + """ + Read EBD/Sampling tables from bytes. + + Supports: + - TSV (tab-separated) plain + - gzip-compressed TSV + - zip containing a TSV/TXT/CSV + + Drops any 'Unnamed:*' columns. + """ + if not file_bytes: + raise ValueError("Empty file bytes.") + + # ZIP container + if zipfile.is_zipfile(io.BytesIO(file_bytes)): + with tempfile.TemporaryDirectory() as td: + zp = os.path.join(td, "f.zip") + with open(zp, "wb") as f: + f.write(file_bytes) + + with zipfile.ZipFile(zp, "r") as zf: + names = zf.namelist() + cand = [n for n in names if n.lower().endswith((".txt", ".tsv", ".csv"))] + if not cand: + raise ValueError("ZIP does not contain a .txt/.tsv/.csv table.") + target = cand[0] + with zf.open(target) as zfh: + raw = zfh.read() + return _read_bytes_table(raw) + + # GZIP container + if file_bytes[:2] == b"\x1f\x8b": + try: + raw = gzip.decompress(file_bytes) + except Exception as e: + raise ValueError(f"Failed to decompress gzip content: {e}") from e + return _read_bytes_table(raw) + + # Plain text table: try TSV then CSV + bio = io.BytesIO(file_bytes) + try: + df = pd.read_csv(bio, sep="\t", dtype=str, low_memory=False) + except Exception: + bio.seek(0) + df = pd.read_csv(bio, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, 
~df.columns.astype(str).str.startswith("Unnamed:")] + return df + +def _read_path_table(path: str) -> pd.DataFrame: + """ + Read EBD/Sampling tables from a local filesystem path. + + Supports: + - plain TSV/CSV + - .gz + - .zip containing a .txt/.tsv/.csv table + + Drops any 'Unnamed:*' columns. + """ + if not path: + raise ValueError("Empty file path.") + if not os.path.exists(path): + raise ValueError(f"File does not exist: {path}") + + lower = path.lower() + + # ZIP container + if lower.endswith(".zip"): + with zipfile.ZipFile(path, "r") as zf: + names = zf.namelist() + cand = [n for n in names if n.lower().endswith((".txt", ".tsv", ".csv"))] + if not cand: + raise ValueError("ZIP does not contain a .txt/.tsv/.csv table.") + target = cand[0] + with zf.open(target) as zfh: + try: + df = pd.read_csv(zfh, sep="\t", dtype=str, low_memory=False) + except Exception: + zfh.close() + with zf.open(target) as zfh2: + df = pd.read_csv(zfh2, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + # GZIP container + if lower.endswith(".gz"): + try: + df = pd.read_csv(path, sep="\t", dtype=str, low_memory=False, compression="gzip") + except Exception: + df = pd.read_csv(path, sep=",", dtype=str, low_memory=False, compression="gzip") + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + # Plain text table + try: + df = pd.read_csv(path, sep="\t", dtype=str, low_memory=False) + except Exception: + df = pd.read_csv(path, sep=",", dtype=str, low_memory=False) + + df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")] + return df + + +def _read_table_input(table_input: Any) -> pd.DataFrame: + """ + Read table either from bytes (old FileInput workflow) or from local path. 
+ """ + if isinstance(table_input, (str, os.PathLike)): + return _read_path_table(os.fspath(table_input)) + return _read_bytes_table(table_input) + +def _ensure_cols(df: pd.DataFrame, cols: Sequence[str], label: str) -> None: + missing = [c for c in cols if c not in df.columns] + if missing: + raise ValueError(f"Missing required columns in {label}: {missing}") + + +def _load_polygon(polygon_source: Any, filename_hint: str) -> "gpd.GeoDataFrame": + """ + Load polygon from a local path or bytes. Supports: + - zipped shapefile (.zip) + - GeoJSON / JSON + Returns GeoDataFrame in EPSG:4326. + """ + if gpd is None: + raise ImportError("geopandas is required for polygon operations.") + + if isinstance(polygon_source, (str, os.PathLike)): + path = os.fspath(polygon_source) + if not os.path.exists(path): + raise ValueError(f"Polygon file does not exist: {path}") + if not os.path.isfile(path): + raise ValueError(f"Polygon path is not a file: {path}") + + lower = path.lower() + if lower.endswith(".zip"): + with tempfile.TemporaryDirectory() as td: + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(td) + + shp = None + for root, _dirs, files in os.walk(td): + for fn in files: + if fn.lower().endswith(".shp"): + shp = os.path.join(root, fn) + break + if shp: + break + if not shp: + raise ValueError("Polygon ZIP does not contain a .shp file.") + poly = gpd.read_file(shp) + else: + poly = gpd.read_file(path) + else: + polygon_bytes = polygon_source + with tempfile.TemporaryDirectory() as td: + if (filename_hint or "").lower().endswith(".zip") or zipfile.is_zipfile(io.BytesIO(polygon_bytes)): + zp = os.path.join(td, "poly.zip") + with open(zp, "wb") as f: + f.write(polygon_bytes) + with zipfile.ZipFile(zp, "r") as zf: + zf.extractall(td) + + shp = None + for root, _dirs, files in os.walk(td): + for fn in files: + if fn.lower().endswith(".shp"): + shp = os.path.join(root, fn) + break + if shp: + break + if not shp: + raise ValueError("Polygon ZIP does not contain a .shp 
def _load_bbox_polygon(bbox: Sequence[float]) -> "gpd.GeoDataFrame":
    """
    Turn a bounding box into a one-row polygon GeoDataFrame (EPSG:4326).

    ``bbox`` is ``(west, south, east, north)`` in degrees. The result has a
    single feature named ``bbox_region``.

    Raises
    ------
    ImportError
        If geopandas or shapely could not be imported.
    ValueError
        If ``bbox`` does not hold exactly four values, a longitude is outside
        [-180, 180], a latitude is outside [-90, 90], west is not smaller
        than east, or south is not smaller than north.
    """
    if gpd is None or box is None:
        raise ImportError("geopandas + shapely are required for bbox operations.")

    if bbox is None or len(bbox) != 4:
        raise ValueError("BBox must contain exactly 4 values: west, south, east, north.")

    west, south, east, north = (float(v) for v in bbox)

    longitudes_ok = all(-180 <= lon <= 180 for lon in (west, east))
    if not longitudes_ok:
        raise ValueError("Invalid bbox: longitude must be between -180 and 180.")

    latitudes_ok = all(-90 <= lat <= 90 for lat in (south, north))
    if not latitudes_ok:
        raise ValueError("Invalid bbox: latitude must be between -90 and 90.")

    if west >= east:
        raise ValueError("Invalid bbox: west must be smaller than east.")
    if south >= north:
        raise ValueError("Invalid bbox: south must be smaller than north.")

    region = box(west, south, east, north)
    return gpd.GeoDataFrame(
        {"name": ["bbox_region"]},
        geometry=[region],
        crs="EPSG:4326",
    )
def _parse_obs_datetime(df: pd.DataFrame) -> pd.Series:
    """
    Parse timestamp from OBSERVATION DATE and TIME OBSERVATIONS STARTED.

    Returns a datetime Series aligned with ``df.index``:
    - date and time present -> parsed "date time"
    - date only (time blank or column absent) -> midnight of that date
    - date blank or unparseable -> NaT

    Date-only and date+time strings are parsed in two separate passes.
    ``pd.to_datetime`` infers a single format from the first value (pandas >= 2.0),
    so one mixed pass would coerce every row of the other shape to NaT.
    """
    dates = df["OBSERVATION DATE"].fillna("").astype(str).str.strip()
    if "TIME OBSERVATIONS STARTED" in df.columns:
        times = df["TIME OBSERVATIONS STARTED"].fillna("").astype(str).str.strip()
    else:
        # Build the fallback on df.index so later element-wise operations stay aligned.
        times = pd.Series("", index=df.index, dtype=object)

    has_date = dates != ""
    # A time without a date is not a usable timestamp (it must not parse as "today").
    has_time = has_date & (times != "")

    date_only = pd.to_datetime(dates.where(has_date), errors="coerce")
    date_time = pd.to_datetime((dates + " " + times).where(has_time), errors="coerce")
    return date_time.where(has_time, date_only)


def _parse_counts(df: pd.DataFrame, treat_x_as_one: bool) -> pd.Series:
    """
    Parse OBSERVATION COUNT as numeric; supports 'X' for unknown counts.

    Values are stripped and upper-cased first. With ``treat_x_as_one`` an 'X'
    becomes 1; otherwise it, like any other non-numeric value, becomes NaN.
    """
    raw = df["OBSERVATION COUNT"].fillna("").astype(str).str.strip().str.upper()
    if treat_x_as_one:
        raw = raw.replace({"X": "1"})
    return pd.to_numeric(raw, errors="coerce")


def _normalize_protocol_values(values: Optional[List[str]]) -> Optional[set]:
    """
    Normalize protocol values for matching.

    Returns None when *values* is None or empty; otherwise the set of stripped,
    non-blank string forms of the entries (which may itself be empty).
    """
    if not values:
        return None
    stripped = (str(v).strip() for v in values)
    return {text for text in stripped if text}
+ """ + out = m.copy() + + # REVIEWED / APPROVED / ALL SPECIES REPORTED (AND logic if multiple are True) + if vet.require_reviewed and "REVIEWED" in out.columns: + out = out[_truthy(out["REVIEWED"])] + + if vet.require_approved and "APPROVED" in out.columns: + out = out[_truthy(out["APPROVED"])] + + if vet.require_all_species_reported and "ALL SPECIES REPORTED" in out.columns: + out = out[_truthy(out["ALL SPECIES REPORTED"])] + + # Protocol filtering (optional) + allowed = _normalize_protocol_values(vet.allowed_protocols) + + if allowed: + allowed_norm = {str(a).strip() for a in allowed if str(a).strip()} + + # 1) PROTOCOL TYPE (preferred) + if "PROTOCOL TYPE" in out.columns: + out = out[out["PROTOCOL TYPE"].astype(str).str.strip().isin(allowed_norm)] + + # 2) PROTOCOL NAME (common in sampling file) + elif "PROTOCOL NAME" in out.columns: + out = out[out["PROTOCOL NAME"].astype(str).str.strip().isin(allowed_norm)] + + # 3) OBSERVATION TYPE (common in EBD) + elif "OBSERVATION TYPE" in out.columns: + out = out[out["OBSERVATION TYPE"].astype(str).str.strip().isin(allowed_norm)] + + # 4) Fallback to PROTOCOL CODE only if UI supplies codes + elif "PROTOCOL CODE" in out.columns: + allowed_u = {a.upper() for a in allowed_norm} + out = out[out["PROTOCOL CODE"].astype(str).str.strip().str.upper().isin(allowed_u)] + + + # Exclude incidental/historical (optional) + if vet.exclude_incidental_historical and "PROTOCOL TYPE" in out.columns: + bad = {"Incidental", "Historical"} + out = out[~out["PROTOCOL TYPE"].astype(str).str.strip().isin(bad)] + + # Duration bounds (optional) + if "DURATION MINUTES" in out.columns: + dur = pd.to_numeric(out["DURATION MINUTES"], errors="coerce") + out = out[(dur.isna()) | ((dur >= vet.duration_min_minutes) & (dur <= vet.duration_max_minutes))] + + # Distance bounds (optional) + if "EFFORT DISTANCE KM" in out.columns: + dist = pd.to_numeric(out["EFFORT DISTANCE KM"], errors="coerce") + out = out[(dist.isna()) | ((dist >= vet.distance_min_km) & 
(dist <= vet.distance_max_km))] + + # Coordinates + if vet.require_valid_coords: + out = out[out["latitude"].notna() & out["longitude"].notna()] + + # Require valid timestamp + out = out[out["__dt"].notna()] + return out + +def _write_manifest(path: str, payload: Dict[str, Any]) -> None: + """ + Write JSON manifest to disk. + """ + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + +def _assign_n_day_bins( + datetimes: pd.Series, + start_date: dt.date, + end_date: dt.date, + step_days: int, +) -> pd.DataFrame: + """ + Assign each datetime to an N-day bin starting from start_date. + + Returns a DataFrame with: + - time_bin_start + - time_bin_end + + Both are strings in YYYY-MM-DD format. + """ + if step_days < 1: + raise ValueError("step_days must be >= 1.") + + ts = pd.to_datetime(datetimes, errors="coerce") + start_ts = pd.Timestamp(start_date) + end_ts = pd.Timestamp(end_date) + + day_offsets = (ts.dt.normalize() - start_ts).dt.days + bin_index = (day_offsets // step_days).astype("Int64") + + bin_start = start_ts + pd.to_timedelta(bin_index * step_days, unit="D") + bin_end = bin_start + pd.to_timedelta(step_days - 1, unit="D") + bin_end = bin_end.where(bin_end <= end_ts, end_ts) + + return pd.DataFrame( + { + "time_bin_start": bin_start.dt.strftime("%Y-%m-%d"), + "time_bin_end": bin_end.dt.strftime("%Y-%m-%d"), + }, + index=datetimes.index, + ) + +def _assign_grid_nodes( + df: pd.DataFrame, + grid_step_deg: float, + origin_west: float, + origin_south: float, +) -> pd.DataFrame: + """ + Assign observations to regular lon/lat grid nodes. + + Grid nodes are anchored at (origin_west, origin_south) and repeated every + grid_step_deg degrees. 
def _safe_divide(num: pd.Series, den: pd.Series) -> pd.Series:
    """
    Element-wise ``num / den`` that yields NaN instead of dividing by a bad denominator.

    Both inputs are coerced to numeric (non-numeric values become NaN). A
    denominator that is missing or not strictly positive is replaced by NaN
    before dividing, so the corresponding result is NaN.
    """
    numerator = pd.to_numeric(num, errors="coerce")
    denominator = pd.to_numeric(den, errors="coerce")

    is_positive = denominator > 0
    usable = denominator.where(is_positive)
    return numerator.div(usable)
+ Outputs: + - counts CSV (A): time_bin_start, time_bin_end, location-lat, location-long, species, + total_count, n_checklists, n_checklists_all, n_complete_checklists, + n_detected_complete_checklists, sum_duration_hours_complete, + sum_party_hours_complete, reporting_rate, count_per_complete_checklist, + count_per_hour, count_per_party_hour_complete, + mean_count_when_detected, region_id + - presence CSV (B): time_bin_start, time_bin_end, location-lat, location-long, species, + presence, n_checklists, n_checklists_all, n_complete_checklists, + n_detected_complete_checklists, reporting_rate, region_id + Returns: + - sorted list of unique species found in the counts output. + """ + if gpd is None or Point is None: + raise ImportError("geopandas + shapely are required for polygon operations.") + if agg.step_days < 1: + raise ValueError("Aggregation step_days must be >= 1.") + if agg.grid_step_deg < 0: + raise ValueError("Aggregation grid_step_deg must be >= 0.") + + obs = _read_table_input(ebd_bytes) + samp = _read_table_input(sampling_bytes) + + _ensure_cols( + obs, + [ + "SAMPLING EVENT IDENTIFIER", + "LATITUDE", + "LONGITUDE", + "OBSERVATION DATE", + "SCIENTIFIC NAME", + "COMMON NAME", + "OBSERVATION COUNT", + ], + "EBD observations", + ) + _ensure_cols(samp, ["SAMPLING EVENT IDENTIFIER"], "Sampling events") + + key = "SAMPLING EVENT IDENTIFIER" + merged = obs.merge( + samp.drop_duplicates(subset=[key]), + on=key, + how="left", + suffixes=("", "_samp"), + ) + + merged["__dt"] = _parse_obs_datetime(merged) + merged["latitude"] = pd.to_numeric(merged["LATITUDE"], errors="coerce") + merged["longitude"] = pd.to_numeric(merged["LONGITUDE"], errors="coerce") + + m = _apply_vetting(merged, vet) + + poly = _resolve_spatial_filter( + polygon_bytes=polygon_bytes, + polygon_filename_hint=polygon_filename_hint, + bbox=bbox, + ) + poly_union = poly.dissolve().geometry.iloc[0] + + minx, miny, maxx, maxy = poly.total_bounds + if bbox is not None: + origin_west, origin_south = 
float(bbox[0]), float(bbox[1]) + else: + origin_west, origin_south = float(minx), float(miny) + + gdf = gpd.GeoDataFrame( + m, + geometry=[Point(xy) for xy in zip(m["longitude"], m["latitude"])], + crs="EPSG:4326", + ) + gdf = gdf[gdf.intersects(poly_union)].drop(columns=["geometry"]) + m = pd.DataFrame(gdf) + + # 2) Start/End date limits (inclusive) + start_dt = pd.Timestamp(agg.start_date) + end_dt = pd.Timestamp(agg.end_date) + pd.Timedelta(days=1) - pd.Timedelta(seconds=1) + m = m[(m["__dt"] >= start_dt) & (m["__dt"] <= end_dt)] + + # Parse counts + cnt = _parse_counts(m, treat_x_as_one=agg.treat_x_as_one) + # If count is missing (including treat_x_as_one=False), default to 1.0 for presence-like behavior + m["__count"] = cnt.fillna(1.0) + + # Optional clip + if vet.clip_counts_above and vet.clip_counts_above > 0: + m["__count"] = m["__count"].clip(upper=vet.clip_counts_above) + + # Species label for aggregation + m["species"] = m["SCIENTIFIC NAME"].fillna(m["COMMON NAME"]).astype(str) + + # Time binning: fixed-size bins in N days, anchored at agg.start_date + bins = _assign_n_day_bins( + m["__dt"], + start_date=agg.start_date, + end_date=agg.end_date, + step_days=agg.step_days, + ) + m["time_bin_start"] = bins["time_bin_start"] + m["time_bin_end"] = bins["time_bin_end"] + + # Spatial aggregation: + # - grid_step_deg == 0: keep original observation coordinates + # - grid_step_deg > 0: assign to regular grid nodes and aggregate by node + if agg.grid_step_deg > 0: + m = _assign_grid_nodes( + m, + grid_step_deg=agg.grid_step_deg, + origin_west=origin_west, + origin_south=origin_south, + ) + loc_lat_col = "grid_lat" + loc_lon_col = "grid_lon" + else: + loc_lat_col = "latitude" + loc_lon_col = "longitude" + + # Common grouping keys + spatial_time_keys = ["time_bin_start", "time_bin_end", loc_lat_col, loc_lon_col] + species_keys = spatial_time_keys + ["species"] + + # Complete checklist flag + if "ALL SPECIES REPORTED" in m.columns: + m["__complete_checklist"] = 
_truthy(m["ALL SPECIES REPORTED"]) + else: + m["__complete_checklist"] = False + + # Duration in hours + if "DURATION MINUTES" in m.columns: + m["__duration_hours"] = pd.to_numeric(m["DURATION MINUTES"], errors="coerce") / 60.0 + else: + m["__duration_hours"] = np.nan + # Number of observers + if "NUMBER OBSERVERS" in m.columns: + m["__n_observers"] = pd.to_numeric(m["NUMBER OBSERVERS"], errors="coerce") + else: + m["__n_observers"] = np.nan + # ------------------------------------------------------------------ + # 1) Denominator table from unique checklists at time_bin + spatial unit + # ------------------------------------------------------------------ + checklist_cols = [ + key, + "time_bin_start", + "time_bin_end", + loc_lat_col, + loc_lon_col, + "__complete_checklist", + "__duration_hours", + "__n_observers", + ] + checklist_frame = m[checklist_cols].drop_duplicates(subset=[key]) + + checklist_frame["__duration_hours_complete_only"] = checklist_frame["__duration_hours"].where( + checklist_frame["__complete_checklist"], + np.nan, + ) + + checklist_frame["__party_hours"] = checklist_frame["__duration_hours"] * checklist_frame["__n_observers"] + checklist_frame["__party_hours_complete_only"] = checklist_frame["__party_hours"].where( + checklist_frame["__complete_checklist"], + np.nan, + ) + + denom = ( + checklist_frame + .groupby(spatial_time_keys, dropna=False) + .agg( + n_checklists_all=(key, pd.Series.nunique), + n_complete_checklists=("__complete_checklist", "sum"), + sum_duration_hours_complete=("__duration_hours_complete_only", "sum"), + sum_party_hours_complete=("__party_hours_complete_only", "sum"), + ) + .reset_index() + ) + + # ------------------------------------------------------------------ + # 2) Species table from detections + # ------------------------------------------------------------------ + grp = m.groupby(species_keys, dropna=False) + + counts = grp.agg( + total_count=("__count", "sum"), + n_checklists=(key, pd.Series.nunique), + 
).reset_index() + + pres = grp.agg( + presence=("__count", lambda x: 1), + n_checklists=(key, pd.Series.nunique), + ).reset_index() + + # ------------------------------------------------------------------ + # 3) Species table from detections on complete checklists only + # ------------------------------------------------------------------ + detected_complete = m[m["__complete_checklist"]].copy() + + if len(detected_complete) > 0: + grp_complete = detected_complete.groupby(species_keys, dropna=False) + det_complete = grp_complete.agg( + n_detected_complete_checklists=(key, pd.Series.nunique), + ).reset_index() + else: + det_complete = pd.DataFrame(columns=species_keys + ["n_detected_complete_checklists"]) + + # ------------------------------------------------------------------ + # 4) Join denominator + derived metrics + # ------------------------------------------------------------------ + counts = counts.merge(denom, on=spatial_time_keys, how="left") + counts = counts.merge(det_complete, on=species_keys, how="left") + counts["n_detected_complete_checklists"] = counts["n_detected_complete_checklists"].fillna(0) + + counts["reporting_rate"] = _safe_divide( + counts["n_detected_complete_checklists"], + counts["n_complete_checklists"], + ) + counts["count_per_complete_checklist"] = _safe_divide( + counts["total_count"], + counts["n_complete_checklists"], + ) + counts["count_per_hour"] = _safe_divide( + counts["total_count"], + counts["sum_duration_hours_complete"], + ) + counts["count_per_party_hour_complete"] = _safe_divide( + counts["total_count"], + counts["sum_party_hours_complete"], + ) + counts["mean_count_when_detected"] = _safe_divide( + counts["total_count"], + counts["n_checklists"], + ) + + counts["region_id"] = region_id + counts = counts.rename(columns={loc_lat_col: "location-lat", loc_lon_col: "location-long"}) + + pres = pres.merge(denom, on=spatial_time_keys, how="left") + pres = pres.merge(det_complete, on=species_keys, how="left") + 
pres["n_detected_complete_checklists"] = pres["n_detected_complete_checklists"].fillna(0) + + pres["reporting_rate"] = _safe_divide( + pres["n_detected_complete_checklists"], + pres["n_complete_checklists"], + ) + pres["count_per_complete_checklist"] = np.nan + pres["count_per_hour"] = np.nan + pres["count_per_party_hour_complete"] = np.nan + pres["mean_count_when_detected"] = np.nan + + pres["region_id"] = region_id + pres = pres.rename(columns={loc_lat_col: "location-lat", loc_lon_col: "location-long"}) + + os.makedirs(os.path.dirname(os.path.abspath(out_counts_csv)), exist_ok=True) + counts.to_csv(out_counts_csv, index=False, encoding="utf-8") + + os.makedirs(os.path.dirname(os.path.abspath(out_presence_csv)), exist_ok=True) + pres.to_csv(out_presence_csv, index=False, encoding="utf-8") + + if manifest_json: + if bbox is not None: + west, south, east, north = [float(v) for v in bbox] + spatial_filter = { + "type": "bbox", + "west": west, + "south": south, + "east": east, + "north": north, + } + else: + spatial_filter = { + "type": "polygon", + "filename_hint": polygon_filename_hint or "", + } + + payload: Dict[str, Any] = { + "created_at": dt.datetime.now().isoformat(), + "region_id": region_id, + "source_mode": "EBD + Sampling Event", + "spatial_filter": spatial_filter, + "time": { + "start": str(agg.start_date), + "end": str(agg.end_date), + "step_days": int(agg.step_days), + }, + "grid": { + "grid_step_deg": float(agg.grid_step_deg), + "origin_west": float(origin_west), + "origin_south": float(origin_south), + "mode": "grid" if agg.grid_step_deg > 0 else "original_coordinates", + }, + "derived_metrics": [ + "reporting_rate", + "count_per_complete_checklist", + "n_complete_checklists", + "count_per_hour", + "count_per_party_hour_complete", + "mean_count_when_detected", + ], + "vetting": vet.__dict__, + "outputs": { + "agg_counts_csv": out_counts_csv, + "agg_presence_csv": out_presence_csv, + }, + } + _write_manifest(manifest_json, payload) + + # 3) Species 
def read_species_from_agg_counts(agg_counts_csv: str) -> List[str]:
    """
    Read unique species list from aggregated counts CSV.

    Returns an empty list when the file does not exist; otherwise the sorted,
    de-duplicated, non-missing values of the ``species`` column as strings.
    """
    if not os.path.exists(agg_counts_csv):
        return []
    species = pd.read_csv(agg_counts_csv, usecols=["species"])["species"]
    return sorted(species.dropna().astype(str).unique().tolist())


def export_tracks_from_aggregated_counts(
    *,
    agg_counts_csv: str,
    tracks_csv: str,
    region_id: str,
    id_mode: str = "species",
    species_filter: Optional[List[str]] = None,
) -> None:
    """
    Convert aggregated counts file into Movebank-like pseudo-tracks CSV for ECODATA-Animate.

    If species_filter is provided and non-empty, export only those species.
    Names are compared after stripping surrounding whitespace on both sides,
    so a species stored with stray whitespace can still be selected.

    ``id_mode="species|region"`` builds identifiers as ``<species>|region:<region_id>``;
    any other value uses the species name alone.

    Output columns, in order:
      - timestamp (time_bin_start as YYYY-MM-DDTHH:MM:SS)
      - location-long
      - location-lat
      - individual-local-identifier
      - species
      - count (total_count, or 1 when that column is absent)
      - bin_id (time_bin_start as string)
      - region_id
      - total_count, n_checklists, n_checklists_all, n_complete_checklists,
        n_detected_complete_checklists, sum_duration_hours_complete,
        sum_party_hours_complete, reporting_rate, count_per_complete_checklist,
        count_per_hour, count_per_party_hour_complete, mean_count_when_detected
        (left empty when absent from the input)

    Raises:
        FileNotFoundError: if agg_counts_csv does not exist.
    """
    if not os.path.exists(agg_counts_csv):
        raise FileNotFoundError(f"Aggregated counts file not found: {agg_counts_csv}")

    metric_columns = [
        "total_count",
        "n_checklists",
        "n_checklists_all",
        "n_complete_checklists",
        "n_detected_complete_checklists",
        "sum_duration_hours_complete",
        "sum_party_hours_complete",
        "reporting_rate",
        "count_per_complete_checklist",
        "count_per_hour",
        "count_per_party_hour_complete",
        "mean_count_when_detected",
    ]

    df = pd.read_csv(agg_counts_csv)

    if species_filter:
        keep = {str(s).strip() for s in species_filter if str(s).strip()}
        if keep:
            df = df[df["species"].astype(str).str.strip().isin(keep)]

    species = df["species"].astype(str)
    if id_mode == "species|region":
        track_id = species + "|region:" + str(region_id)
    else:
        track_id = species

    bin_start = pd.to_datetime(df["time_bin_start"], errors="coerce")

    # Build the output from fresh objects rather than assigning columns onto the
    # filtered frame, which is a slice of the frame read from disk.
    core = pd.DataFrame(
        {
            "timestamp": bin_start.dt.strftime("%Y-%m-%dT%H:%M:%S"),
            "location-long": df["location-long"],
            "location-lat": df["location-lat"],
            "individual-local-identifier": track_id,
            "species": df["species"],
            "count": df.get("total_count", 1),
            "bin_id": df["time_bin_start"].astype(str),
            "region_id": region_id,
        }
    )
    # reindex keeps the metric columns in a fixed order and fills absent ones with NaN.
    metrics = df.reindex(columns=metric_columns)
    out = pd.concat([core, metrics], axis=1)

    os.makedirs(os.path.dirname(os.path.abspath(tracks_csv)), exist_ok=True)
    out.to_csv(tracks_csv, index=False, encoding="utf-8")
Spatial filtering === - df_filtered, _ = filter_points_within_boundary(movebank_path, selected_ids, boundary_path, bbox=bbox) + df_filtered, trimmed_path = filter_points_within_boundary( + movebank_path, selected_ids, boundary_path, bbox=bbox + ) + if df_filtered.empty: print("[WARNING] No points within the boundary.") + remove_temporary_trimmed_file(trimmed_path) return # ===*** Time prefiltering (union across selected variables) === @@ -182,6 +198,7 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele df_filtered = filter_points_within_timerange(df_filtered, nc_start, nc_end) if df_filtered.empty: print("[WARNING] No points within the NC time window after prefiltering.") + remove_temporary_trimmed_file(trimmed_path) return # ===*** @@ -194,6 +211,7 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele categorical_vars=categorical_vars) if result is None: print("[ERROR] Environmental data was not loaded.") + remove_temporary_trimmed_file(trimmed_path) return df_annotated, ann_nc_start, ann_nc_end = result @@ -246,7 +264,6 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele print(f"[DEBUG] Filled '{var}': total={filled_total}, within-NC-window={filled_in_nc}") else: print(f"[WARNING] Column '{var}' not found in annotated DataFrame.") -##### # === Step 3: Time filtering === df_time_filtered = df_annotated.copy() @@ -276,6 +293,7 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele print(f"[INFO] Saved {len(unique_ids)} individual files to {output_folder}") else: print("[WARNING] Column 'individual_local_identifier' not found. 
Skipping per-ID export.") + remove_temporary_trimmed_file(trimmed_path) def filter_points_within_boundary(movebank_path, selected_ids, boundary_path=None, bbox=None): From 9c8cd627b03d0a9e3e738130ebdd22bece65680c Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Wed, 13 May 2026 13:52:41 -0600 Subject: [PATCH 09/17] Update pip install options in dev environment file --- ecodata-dev-env.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecodata-dev-env.yml b/ecodata-dev-env.yml index 6066158..be6ad26 100644 --- a/ecodata-dev-env.yml +++ b/ecodata-dev-env.yml @@ -24,5 +24,5 @@ dependencies: - click - build # - panel-jstree # panel needs to updated to >1 first - - -e . --global-option="--no-deps" + - --no-deps -e . name: eco-dev From d007e922e8be7464a8a11f0c067277110f03762e Mon Sep 17 00:00:00 2001 From: olekshche Date: Fri, 15 May 2026 15:57:28 +0300 Subject: [PATCH 10/17] Adapt the projected x/y bilinear interpolation logic from #191 to the updated Annotation Engine UI/backend structure --- ecodata-env.yml | 3 +- ecodata/annotation_eng_func.py | 395 ++++++++++++++++++++-- ecodata/app/apps/annotation_engine_app.py | 339 +++++++++++-------- 3 files changed, 574 insertions(+), 163 deletions(-) diff --git a/ecodata-env.yml b/ecodata-env.yml index f4519dc..ddb39d7 100644 --- a/ecodata-env.yml +++ b/ecodata-env.yml @@ -27,4 +27,5 @@ dependencies: - gdown<4.6 # gdown 4.6.something has a problem with our gdrive files - distributed - geographiclib -- h5netcdf \ No newline at end of file +- h5netcdf +- pyproj \ No newline at end of file diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index c0b29d4..fbfdc6a 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -9,11 +9,12 @@ import numpy as np from datetime import datetime import rasterio +from pyproj import CRS, Transformer LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") -def 
safe_open_nc_with_time_decoding(path): +def safe_open_nc_with_time_decoding(path, time_name: str | None = None): """ Opens a NetCDF file with support for non-standard calendars: julian, gregorian, 360_day, noleap, etc. @@ -24,7 +25,9 @@ def safe_open_nc_with_time_decoding(path): try: ds = xr.open_dataset(path, decode_times=False, chunks="auto") - time_name = _detect_time_name(ds) + if time_name is None: + time_name = _detect_time_name(ds) + if time_name is None: raise ValueError("No time-like coordinate/variable found (e.g., 'time', 'valid_time').") @@ -62,7 +65,11 @@ def safe_open_nc_with_time_decoding(path): raise RuntimeError(f"[ERROR] Failed to decode time using cftime for {path}: {e}") -def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str]): +def get_nc_timerange_for_selected( + env_var_map: dict, + selected_env_vars: list[str], + time_name: str | None = None, +): """ Return union [nc_start, nc_end] across all selected variables. If time is missing for all → (None, None). 
@@ -72,7 +79,7 @@ def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str nc_path = env_var_map.get(v) if not nc_path: continue - ds = safe_open_nc_with_time_decoding(nc_path) + ds = safe_open_nc_with_time_decoding(nc_path, time_name=time_name) try: if ("time" in ds.coords) or ("time" in ds.variables): tmin = pd.to_datetime(ds["time"].values.min()) @@ -84,18 +91,23 @@ def get_nc_timerange_for_selected(env_var_map: dict, selected_env_vars: list[str return nc_start, nc_end -def get_nc_bounds(nc_path: str): +def get_nc_bounds(nc_path: str, env_coord_names: dict | None = None): """ Returns a dictionary of boundaries from .nc in CRS WGS84: {"S": ..., "N": ..., "W": ..., "E": ...} """ - ds = safe_open_nc_with_time_decoding(nc_path) + env_coord_names = env_coord_names or {} + ds = safe_open_nc_with_time_decoding(nc_path, time_name=env_coord_names.get("env_time")) # candidate coordinate names try: - lat_candidates = ("lat", "latitude", "y") - lon_candidates = ("lon", "longitude", "x","long") + lat_name = env_coord_names.get("env_lat") + lon_name = env_coord_names.get("env_lon") + + if not lat_name or not lon_name: + lat_candidates = ("lat", "latitude", "Latitude") + lon_candidates = ("lon", "longitude", "Longitude", "long") + lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) + lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) - lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) if lat_name is None or lon_name is None: raise ValueError("Could not detect lat/lon coordinate names in NetCDF") @@ -163,6 +175,7 @@ def load_taxa_and_ids_from_csv(file_path): def start_annotation_process(env_var_map, selected_env_vars, movebank_path, selected_ids, boundary_path, interpolation_method, bbox=None, smoothing_k: int = 2, out_csv_path=None, 
coord_spec=None, + env_coord_names: dict | None = None, continuous_vars=None, categorical_vars=None, apply_value_correction: bool = False, value_scale_factor: float = 1.0, @@ -182,6 +195,18 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele print("Movebank file:", movebank_path) print("Boundary file:", boundary_path) print("Interpolation method:", interpolation_method) + env_coord_names = env_coord_names or {} + + # bridge from the current coord_spec logic to the #191 commit's env_coord_names naming. + + if not env_coord_names and coord_spec: + env_coord_names = { + "env_time": coord_spec.get("time"), + "env_lat": coord_spec.get("lat"), + "env_lon": coord_spec.get("lon"), + "env_x": None, + "env_y": None, + } # === Step 1: Spatial filtering === df_filtered, trimmed_path = filter_points_within_boundary( @@ -194,7 +219,11 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele return # ===*** Time prefiltering (union across selected variables) === - nc_start, nc_end = get_nc_timerange_for_selected(env_var_map, selected_env_vars) + nc_start, nc_end = get_nc_timerange_for_selected( + env_var_map, + selected_env_vars, + time_name=env_coord_names.get("env_time"), + ) df_filtered = filter_points_within_timerange(df_filtered, nc_start, nc_end) if df_filtered.empty: print("[WARNING] No points within the NC time window after prefiltering.") @@ -203,12 +232,18 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele # ===*** # === Step 2: Loading and interpolation of environmental data === - result = load_selected_environmental_data(df_filtered, env_var_map, - selected_env_vars, movebank_path, - interpolation_method, smoothing_k=smoothing_k, - coord_spec=coord_spec, - continuous_vars=continuous_vars, - categorical_vars=categorical_vars) + result = load_selected_environmental_data( + df_filtered, + env_var_map, + selected_env_vars, + movebank_path, + interpolation_method, + 
smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, + continuous_vars=continuous_vars, + categorical_vars=categorical_vars, + ) if result is None: print("[ERROR] Environmental data was not loaded.") remove_temporary_trimmed_file(trimmed_path) @@ -274,7 +309,7 @@ def start_annotation_process(env_var_map, selected_env_vars, movebank_path, sele out_path = Path(out_csv_path) else: out_path = Path(movebank_path).parent / "annotated_env.csv" - df_time_filtered = df_time_filtered.drop(columns=["geometry", "nc_lat", "nc_lon"], errors="ignore") + df_time_filtered = df_time_filtered.drop(columns=["geometry", "nc_lat", "nc_lon", "x", "y"], errors="ignore") df_time_filtered.to_csv(out_path, index=False, encoding="utf-8-sig", date_format="%Y-%m-%d %H:%M:%S") print(f"[INFO] Final filtered annotation saved to {out_path}") @@ -411,9 +446,12 @@ def interpolate_missing_coordinates(df: pd.DataFrame) -> pd.DataFrame: def load_selected_environmental_data(df, env_var_map, selected_vars, - movebank_path, interpolation_method="Nearest neighbour", smoothing_k: int = 2, + movebank_path, interpolation_method="Nearest neighbour", + smoothing_k: int = 2, coord_spec=None, - continuous_vars=None, categorical_vars=None): + env_coord_names: dict | None = None, + continuous_vars=None, + categorical_vars=None): """ Wrapper that calls the appropriate annotation function depending on the interpolation method. 
@@ -427,6 +465,10 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, - Categorical/QC + IDW selected: categorical/QC variables are not IDW-averaged; they use nearest spatial grid node + nearest timestep + - Continuous + Bilinear projected x/y: + bilinear interpolation on a projected 1D x/y grid + linear temporal interpolation + - Categorical/QC + Bilinear projected x/y: + not allowed, because bilinear interpolation is not valid for class/flag codes """ label = (interpolation_method or "").strip().lower() label = label.replace("neighbor", "neighbour") # Normalise US/UK spelling @@ -438,6 +480,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, method = (interpolation_method or "").lower() is_nearest = ("nearest" in method) is_idw = ("idw" in method) + is_bilinear = "bilinear" in method # If split lists are not provided, treat everything as "selected_vars" cont = list(continuous_vars or []) @@ -450,11 +493,22 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec ) + if is_idw: return annotate_env_IDW( df, env_var_map, selected_vars, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec ) + + if is_bilinear: + return annotate_env_bilinear_projected( + df, + env_var_map, + selected_vars, + movebank_path, + env_coord_names=env_coord_names, + ) + raise ValueError(f"Unknown interpolation method: {interpolation_method}") # If split lists are provided: @@ -521,7 +575,30 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, nc_end = nc_end2 return out_df, nc_start, nc_end + + # 3) Bilinear projected selected: + # continuous -> bilinear projected x/y + linear time + # categorical/QC -> not allowed + if is_bilinear: + if cat: + raise ValueError( + "Bilinear projected interpolation is only valid for continuous variables. " + "Please remove categorical/QC variables or use Nearest/IDW mode." 
+ ) + + bilinear_vars = cont if cont else list(selected_vars or []) + + if not bilinear_vars: + raise ValueError("No continuous variables selected for bilinear projected interpolation.") + return annotate_env_bilinear_projected( + df, + env_var_map, + bilinear_vars, + movebank_path, + env_coord_names=env_coord_names, + ) + raise ValueError(f"Unknown interpolation method: {interpolation_method}") @@ -1017,6 +1094,284 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: out["geometry"] = [Point(lon, lat) for lon, lat in zip(out["nc_lon"], out["nc_lat"])] return out, pd.NaT, pd.NaT +def annotate_env_bilinear_projected( + df, + env_var_map, + selected_vars, + movebank_path, + env_coord_names: dict | None = None, +): + """ + Annotate movement points with environmental values using: + - Spatial: bilinear interpolation on a 1D projected grid (x/y) + - Temporal: linear interpolation in time (xarray interp) + + Tracks input: + - requires lon/lat columns: location_lon, location_lat + - projects lon/lat -> x/y into the env dataset's native CRS using CF metadata + + Env input: + - dataset has 1D x and y coordinate vectors (projected grid) + - dataset provides CF projection metadata so `read_crs_from_cf()` can infer CRS + + Returns: (out_df, pd.NaT, pd.NaT) for signature compatibility. + """ + out = df.copy() + out["timestamp"] = pd.to_datetime(out["timestamp"], dayfirst=True, errors="coerce") + + # Require lon/lat (your code already normalizes movement columns sometimes) + required = ["timestamp", "location_lat", "location_lon"] + out = out.dropna(subset=required) + + env_coord_names = env_coord_names or {} + time_name = env_coord_names.get("env_time") # optional + x_name = env_coord_names.get("env_x") + y_name = env_coord_names.get("env_y") + + if not x_name or not y_name: + raise ValueError( + "Bilinear (projected) requires env_coord_names['env_x'] and ['env_y'] " + "(Projected (x/y) mode)." 
+ ) + if env_coord_names.get("env_lat") or env_coord_names.get("env_lon"): + raise ValueError("Bilinear (projected) requires Projected (x/y) spatial mode, not Geographic (lat/lon).") + + # Target time values (vectorized) + tgt_t = out["timestamp"].to_numpy("datetime64[ns]") + + # Track lon/lat arrays + lon = pd.to_numeric(out["location_lon"], errors="coerce").to_numpy(dtype="float64") + lat = pd.to_numeric(out["location_lat"], errors="coerce").to_numpy(dtype="float64") + + # Drop any rows with bad numeric lon/lat + good = np.isfinite(lon) & np.isfinite(lat) & out["timestamp"].notna().to_numpy() + if not good.all(): + out = out.loc[good].copy() + tgt_t = tgt_t[good] + lon = lon[good] + lat = lat[good] + + # QA columns + out["x"] = np.nan + out["y"] = np.nan + + # Cache CRS/transformer per file path (since you may have multiple labels/files) + crs_cache: dict[str, "CRS"] = {} + + for label in selected_vars: + file_path = env_var_map.get(label) + out[label] = np.nan + + if not file_path or not Path(file_path).is_file(): + print(f"[WARNING] File for {label} not found: {file_path}") + continue + + base_var, target_level = _split_var_and_level(label) + + try: + ds = safe_open_nc_with_time_decoding(file_path, time_name=time_name) + + if base_var not in ds: + print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") + ds.close() + continue + + da = ds[base_var] + dims = list(da.dims) + + # Must be able to interpolate along x/y dims + x_dim = x_name if x_name in dims else None + y_dim = y_name if y_name in dims else None + if x_dim is None or y_dim is None: + ds.close() + raise ValueError( + f"Bilinear requires x/y to be dims of {base_var!r}.\n" + f" Requested x dim: {x_name!r} (is_dim={x_name in dims})\n" + f" Requested y dim: {y_name!r} (is_dim={y_name in dims})\n" + f" Available dims: {dims}" + ) + + # Sort for interpolation stability + ds = _ensure_sorted(ds, y_dim, x_dim) + da = ds[base_var] + dims = list(da.dims) + + if "time" not in dims: + ds.close() + 
raise ValueError(f"No 'time' dim after decoding for '{base_var}'. dims={dims}") + + # Validate 1D x/y coordinate vectors + gx = np.asarray(ds[x_dim].values) + gy = np.asarray(ds[y_dim].values) + if gx.ndim != 1 or gy.ndim != 1: + ds.close() + raise ValueError( + f"Bilinear method requires 1D coordinate vectors for '{y_dim}' and '{x_dim}'. " + f"Got shapes: {y_dim}={gy.shape}, {x_dim}={gx.shape}." + ) + + # Handle extra dims (pressure level, ensemble, expver, etc.) + extra_dims = [d for d in dims if d not in ("time", y_dim, x_dim)] + if extra_dims: + sel = {} + for d in extra_dims: + if d in LEVEL_DIM_CANDIDATES: + sel[d] = _pick_level_index(ds, d, target_level) + else: + sel[d] = 0 + da = da.isel(**sel).squeeze() # -> (time, y, x) + + # --- CRS inference + projection lon/lat -> x/y ------------------------- + if file_path not in crs_cache: + # Prefer variable-specific grid_mapping lookup by passing base_var + crs_cache[file_path] = read_crs_from_cf(ds, var_name=base_var) + + target_crs = crs_cache[file_path] + x_pts, y_pts = project_tracks_lonlat_to_xy(lon, lat, target_crs=target_crs) + + # Store for QA + out["x"] = x_pts + out["y"] = y_pts + + # --- vectorized xarray interpolation ----------------------------------- + pts = xr.Dataset( + coords={"points": np.arange(len(out))}, + data_vars={ + "time": ("points", tgt_t), + x_dim: ("points", x_pts), + y_dim: ("points", y_pts), + }, + ) + + sampled = da.interp({x_dim: pts[x_dim], y_dim: pts[y_dim], "time": pts["time"]}) + out[label] = sampled.to_numpy() + + ds.close() + + except Exception as e: + print(f"[ERROR] {label}: {e}") + continue + + # If you want: geometry in projected CRS (x,y). Comment out if not needed. + out["geometry"] = [Point(x, y) for x, y in zip(out["x"], out["y"])] + + return out, pd.NaT, pd.NaT + +def read_crs_from_cf(ds: xr.Dataset, var_name: str | None = None) -> CRS: + """ + Infer the projected coordinate reference system (CRS) of a gridded + environmental dataset using CF-convention metadata. 
+ + The function attempts, in order: + 1) to read a CF-compliant ``grid_mapping`` attribute from a data variable, + 2) to construct a CRS from global dataset attributes (e.g. WKT or PROJ), + 3) to read CRS information from a standalone ``crs`` variable. + + This is intended for datasets on projected grids (e.g. NARR, ERA5-Land, + regional climate models) where track data in WGS84 lon/lat must be + transformed to native x/y coordinates before spatial interpolation. + + Parameters + ---------- + ds : xarray.Dataset + Environmental dataset containing projected horizontal coordinates + and CF-compliant projection metadata. + var_name : str or None, optional + Name of a data variable whose ``grid_mapping`` attribute should be + inspected first. If None, variable-specific metadata are skipped. + + Returns + ------- + pyproj.CRS + Coordinate reference system describing the dataset's native + horizontal projection. + + Raises + ------ + ValueError + If no usable CRS information can be inferred from the dataset. 
+ """ + + # 1) If a data variable is given, try its grid_mapping attribute + grid_mapping_name = None + if var_name is not None and var_name in ds: + grid_mapping_name = ds[var_name].attrs.get("grid_mapping") + + # 2) If we have a grid mapping variable, parse it as CF + if grid_mapping_name and grid_mapping_name in ds.variables: + gm = ds[grid_mapping_name] + # xarray keeps attrs as dict; pyproj can build CRS from CF dict + try: + return CRS.from_cf(gm.attrs) + except Exception: + pass + + # 3) Common alternate places: global attrs + # Try "crs_wkt", "spatial_ref" (GDAL), "proj4", "proj" + for key in ("crs_wkt", "spatial_ref", "proj_wkt", "wkt"): + wkt = ds.attrs.get(key) + if isinstance(wkt, str) and wkt.strip(): + return CRS.from_wkt(wkt) + + for key in ("proj4", "proj4text", "proj", "projection"): + proj = ds.attrs.get(key) + if isinstance(proj, str) and proj.strip(): + return CRS.from_string(proj) + + # 4) Sometimes there is a standalone "crs" variable with WKT in attrs + if "crs" in ds.variables: + crs_var = ds["crs"] + for key in ("crs_wkt", "spatial_ref"): + wkt = crs_var.attrs.get(key) + if isinstance(wkt, str) and wkt.strip(): + return CRS.from_wkt(wkt) + # Or CF attrs + try: + return CRS.from_cf(crs_var.attrs) + except Exception: + pass + + raise ValueError("Could not infer CRS from dataset (no usable CF grid_mapping / WKT / proj string found).") + + +def project_tracks_lonlat_to_xy( + lon: np.ndarray, + lat: np.ndarray, + target_crs: CRS, +) -> tuple[np.ndarray, np.ndarray]: + """ + Project track locations from geographic coordinates (longitude, latitude) + to the native x/y coordinate system of a projected environmental grid. + + This function is used to transform animal tracking locations + (WGS84 lon/lat) into the coordinate system of gridded datasets such as + NARR before spatial interpolation using xarray. + + Parameters + ---------- + lon : array-like + Longitudes of track locations in degrees east (EPSG:4326). 
+ lat : array-like + Latitudes of track locations in degrees north (EPSG:4326). + target_crs : pyproj.CRS + Target projected CRS describing the environmental dataset grid. + + Returns + ------- + x : numpy.ndarray + Projected x-coordinates of track locations in the target CRS. + y : numpy.ndarray + Projected y-coordinates of track locations in the target CRS. + """ + + lon = np.asarray(lon, dtype=float) + lat = np.asarray(lat, dtype=float) + + transformer = Transformer.from_crs("EPSG:4326", target_crs, always_xy=True) + x, y = transformer.transform(lon, lat) + return np.asarray(x, dtype=float), np.asarray(y, dtype=float) + + def _safe_remove_existing_file(path, retries: int = 5, delay: float = 0.5): """ Remove an existing file before overwriting it. diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index 6dfb476..2641c15 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -71,6 +71,14 @@ class movebank_annotation_engine(param.Parameterized): nc_time_var = pn.widgets.Select(name="Time variable", options=[], value=None) nc_lat_var = pn.widgets.Select(name="Latitude variable", options=[], value=None) nc_lon_var = pn.widgets.Select(name="Longitude variable", options=[], value=None) + env_spatial_mode = pn.widgets.RadioButtonGroup( + name="Env spatial coordinate mode", + options=["Geographic (lat/lon)", "Projected (x/y)"], + value="Geographic (lat/lon)", + button_type="default", + ) + env_x_select = pn.widgets.Select(name="X coordinate", options=[], value=None) + env_y_select = pn.widgets.Select(name="Y coordinate", options=[], value=None) env_continuous_selector = pn.widgets.MultiSelect( name="Continuous (use Ctrl or ⌘ for multiple selection)", options=[], value=[], height=180 @@ -101,7 +109,11 @@ class movebank_annotation_engine(param.Parameterized): ) interpolation_method = pn.widgets.Select( name="Interpolation method (spatial)", - options=["Nearest neighbor 
(time-linear)", "Inverse Distance Weighting (time-linear)"], + options=[ + "Nearest neighbor (time-linear)", + "Inverse Distance Weighting (time-linear)", + "Bilinear (projected x/y, time-linear)", + ], value="Inverse Distance Weighting (time-linear)" ) make_annotation_button = pn.widgets.Button(name="Make annotated file", button_type="primary") @@ -194,6 +206,7 @@ def __init__(self, **params): "env_info", "movement_info" ,"output_path", "make_annotation_button", "nc_time_var", "nc_lat_var","nc_lon_var", + "env_spatial_mode", "env_x_select", "env_y_select", # === TIF Annotation tab === "tif_env_data_selector", "tif_movement_data_selector", @@ -220,7 +233,12 @@ def __init__(self, **params): self.env_continuous_selector, self.env_categorical_selector, self.env_info, - self.nc_time_var, self.nc_lat_var, self.nc_lon_var, + self.env_spatial_mode, + self.nc_time_var, + self.nc_lat_var, + self.nc_lon_var, + self.env_x_select, + self.env_y_select, self.interpolation_method, self.control_smoothing, self.output_path, @@ -260,7 +278,7 @@ def __init__(self, **params): ) # TIF - TIF_H = 1500 + TIF_H = 1800 self._tif_col1 = self._section( "1. 
Environmental data (.tif) - select one (of)", pn.Column(self.tif_env_data_selector, sizing_mode="stretch_width"), @@ -360,6 +378,8 @@ def __init__(self, **params): self.taxon_multiselect.param.watch(lambda e: self.update_movement_info_text("Taxons", e.new), "value") self.id_multiselect.param.watch(lambda e: self.update_movement_info_text("IDs", e.new), "value") self.interpolation_method.param.watch(self._update_smoothing_options, 'value') + self.env_spatial_mode.param.watch(self._apply_env_spatial_mode, "value") + self._apply_env_spatial_mode() ######TIF on click self.tif_load_env_button.on_click(self.load_env_data_tif) self.tif_load_bound_button.on_click(self.load_boundary_data_tif) @@ -626,13 +646,24 @@ def _normalize_interp_key(self, ui_value: str) -> str: return "nearest" if s.startswith("inverse") or "idw" in s: return "idw" - return ui_value # fallback + if "bilinear" in s: + return "bilinear" + return ui_value def _apply_env_selector_labels(self): """Make selector purposes obvious in UI.""" self.env_continuous_selector.name = "Continuous (use Ctrl or ⌘ for multiple)" self.env_categorical_selector.name = "Categorical/QC (use Ctrl or ⌘ for multiple)" + def _apply_env_spatial_mode(self, event=None): + """Enable/disable coordinate selectors depending on selected spatial mode.""" + is_projected = self.env_spatial_mode.value == "Projected (x/y)" + + self.nc_lat_var.disabled = is_projected + self.nc_lon_var.disabled = is_projected + + self.env_x_select.disabled = not is_projected + self.env_y_select.disabled = not is_projected @try_catch("Error loading environmental data") def load_env_data(self, *events): @@ -665,125 +696,88 @@ def load_env_data(self, *events): self._update_info_lines(self.env_info, {"File:": Path(nc_path).name}) self._auto_height(self.env_info) - #################### var_file_map: dict[str, str] = {} time_text = "-" spatial_text = "-" - # Coordinate name candidates - time_candidates = ["time", "Time", "datetime", "date", "valid_time"] - 
lat_candidates = ["lat", "latitude", "Latitude", "y"] - lon_candidates = ["lon", "longitude", "Longitude", "x"] - - try: - ds = safe_open_nc_with_time_decoding(nc_path) - - all_vars = sorted(list(ds.variables.keys())) - - # Populate dropdowns - self.nc_time_var.options = all_vars - self.nc_lat_var.options = all_vars - self.nc_lon_var.options = all_vars - - def pick_first(candidates): - for c in candidates: - if c in all_vars: - return c - return None - - # Preselect defaults (only if user hasn't selected yet) - if not self.nc_time_var.value: - self.nc_time_var.value = pick_first(time_candidates) - if not self.nc_lat_var.value: - self.nc_lat_var.value = pick_first(lat_candidates) - if not self.nc_lon_var.value: - self.nc_lon_var.value = pick_first(lon_candidates) - - # -------- TIME INFO -------- - time_name = self.nc_time_var.value - if time_name and time_name in ds: - tvals = pd.to_datetime(ds[time_name].values) - time_text = f"{tvals.min().date()} — {tvals.max().date()}" - - # ------ SPATIAL INFO ------- - lat_name = self.nc_lat_var.value - lon_name = self.nc_lon_var.value - if lat_name in ds and lon_name in ds: - lat_min = float(ds[lat_name].min()) - lat_max = float(ds[lat_name].max()) - lon_min = float(ds[lon_name].min()) - lon_max = float(ds[lon_name].max()) - spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" - - finally: - ds.close() - + time_candidates = ["time", "Time", "datetime", "date", "valid_time", + "forecast_time", "verification_time"] + lat_candidates = ["lat", "latitude", "Latitude"] + lon_candidates = ["lon", "longitude", "Longitude", "long"] + x_candidates = ["x", "X", "projection_x_coordinate", "easting", "eastings"] + y_candidates = ["y", "Y", "projection_y_coordinate", "northing", "northings"] def _pick(cands): - for c in cands: - if c in all_vars: - return c - return None - - # defalts - self.nc_time_var.value = _pick(["time","Time","datetime","date","valid_time","forecast_time","verification_time"]) - 
self.nc_lat_var.value = _pick(["lat","latitude","y"]) - self.nc_lon_var.value = _pick(["lon","longitude","x","long"]) + return next((c for c in cands if c in all_vars), None) try: ds = safe_open_nc_with_time_decoding(nc_path) try: - # ---- TIME ---- - time_name = next((c for c in time_candidates if c in ds.coords or c in ds.variables), None) - if time_name is not None: - tmin = pd.to_datetime(ds[time_name].values.min()) - tmax = pd.to_datetime(ds[time_name].values.max()) - time_text = f"{tmin.strftime('%Y-%m-%d')} — {tmax.strftime('%Y-%m-%d')}" - - # ---- SPATIAL ---- - lat_name = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) - lon_name = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) - if lat_name and lon_name: + all_vars = sorted(ds.variables.keys()) + + # Populate all dropdowns + self.nc_time_var.options = all_vars + self.nc_lat_var.options = all_vars + self.nc_lon_var.options = all_vars + self.env_x_select.options = all_vars + self.env_y_select.options = all_vars + + # Autoselect defaults (always overwrite — second open removed) + self.nc_time_var.value = _pick(time_candidates) + self.nc_lat_var.value = _pick(lat_candidates) + self.nc_lon_var.value = _pick(lon_candidates) + self.env_x_select.value = _pick(x_candidates) + self.env_y_select.value = _pick(y_candidates) + + # ---- TIME INFO ---- + time_name = self.nc_time_var.value + if time_name and time_name in ds: + tvals = pd.to_datetime(ds[time_name].values) + time_text = f"{tvals.min().date()} — {tvals.max().date()}" + + # ---- SPATIAL INFO (geographic fallback) ---- + lat_name = self.nc_lat_var.value + lon_name = self.nc_lon_var.value + if lat_name and lat_name in ds and lon_name and lon_name in ds: lat_min = float(ds[lat_name].min()) lat_max = float(ds[lat_name].max()) lon_min = float(ds[lon_name].min()) lon_max = float(ds[lon_name].max()) - spatial_text = f"lat[{lat_min:.3f}..{lat_max:.3f}], lon[{lon_min:.3f}..{lon_max:.3f}]" - - # List of 
variables with support for vertical levels - LEVEL_DIM_CANDIDATES = ("isobaricInhPa", "isobaric_in_hPa", "level", "lev", "plev", "pressure", "pressure_level") + spatial_text = ( + f"lat[{lat_min:.3f}..{lat_max:.3f}], " + f"lon[{lon_min:.3f}..{lon_max:.3f}]" + ) + # ---- VARIABLE LIST with vertical level expansion ---- + LEVEL_DIM_CANDIDATES_LOCAL = ( + "isobaricInhPa", "isobaric_in_hPa", "level", + "lev", "plev", "pressure", "pressure_level" + ) for var in ds.data_vars: da = ds[var] if da.ndim < 3: continue - dims = list(da.dims) - - level_dim = next((d for d in LEVEL_DIM_CANDIDATES if d in dims), None) - + level_dim = next( + (d for d in LEVEL_DIM_CANDIDATES_LOCAL if d in dims), None + ) if level_dim is None: var_file_map[var] = nc_path - continue - - # options for each level: var_1000, var_975, ... - try: - level_vals = ds[level_dim].values - except Exception: - level_vals = [] - - for lv in level_vals: + else: try: - # default - hPa (1000, 975, 950 …) - lv_int = int(round(float(lv))) - label = f"{var}_{lv_int}" - var_file_map[label] = nc_path + level_vals = ds[level_dim].values except Exception: - # skip if non-numeric - continue + level_vals = [] + for lv in level_vals: + try: + lv_int = int(round(float(lv))) + var_file_map[f"{var}_{lv_int}"] = nc_path + except Exception: + continue finally: ds.close() + except Exception as e: self.status_text = f"Failed to open dataset: {e}" self.alert.object = self.status_text @@ -791,42 +785,32 @@ def _pick(cands): # Update Time/Spatial information block self._update_info_lines(self.env_info, { - "Time range:": time_text, - "Spatial range:": spatial_text + "Time range:": time_text, + "Spatial range:": spatial_text, }) self._auto_height(self.env_info) - # Variable options if not var_file_map: self.env_continuous_selector.options = [] self.env_categorical_selector.options = [] - self.env_continuous_selector.value = [] - self.env_categorical_selector.value = [] + self.env_continuous_selector.value = [] + 
self.env_categorical_selector.value = [] self.status_text = "No 3D variables (e.g. time/lat/lon) found in the file." self.alert.object = self.status_text return - # Store map label -> nc_path self.env_variable_sources = var_file_map - - # labels : continuous vs categorical all_labels = list(var_file_map.keys()) - # both selectors get ALL variables in options + self.env_continuous_selector.options = all_labels self.env_categorical_selector.options = all_labels - # reset selections - self.env_continuous_selector.value = [] - self.env_categorical_selector.value = [] - self.status_text = f"Loaded {len(all_labels)} variable(s). Now split them into Continuous vs Categorical/QC." - self.alert.object = self.status_text - self._sync_nc_column_heights() - - - self.status_text = f"Loaded {len(var_file_map)} variable(s)." - self.alert.object = self.status_text - self._sync_nc_column_heights() - #### + self.env_continuous_selector.value = [] + self.env_categorical_selector.value = [] + self.status_text = ( + f"Loaded {len(all_labels)} variable(s). " + "Now split them into Continuous vs Categorical/QC." + ) self.alert.object = self.status_text self._sync_nc_column_heights() @@ -962,6 +946,22 @@ def run_annotation(self, *events): movebank_path = self.movement_data_selector.value boundary_path = getattr(self, "boundary_path", None) interpolation_method = self._normalize_interp_key(self.interpolation_method.value) + spatial_mode = self.env_spatial_mode.value + + if spatial_mode == "Projected (x/y)" and interpolation_method != "bilinear": + self.status_text = ( + "Projected (x/y) mode currently supports only " + "Bilinear (projected x/y, time-linear) interpolation." + ) + self.alert.object = self.status_text + return + + if spatial_mode == "Geographic (lat/lon)" and interpolation_method == "bilinear": + self.status_text = ( + "Bilinear projected interpolation requires Projected (x/y) mode." 
+ ) + self.alert.object = self.status_text + return smoothing_points = int(self.control_smoothing.value) if not selected_vars: @@ -973,7 +973,6 @@ def run_annotation(self, *events): else: bbox = None if not boundary_path: - # building boundaries with .nc first_var = selected_vars[0] nc_path = env_var_map.get(first_var) if not nc_path: @@ -981,37 +980,90 @@ def run_annotation(self, *events): self.alert.object = self.status_text return - try: - bounds = get_nc_bounds(nc_path) # {"S":..., "N":..., "W":..., "E":...} - bbox = bounds - # Updating the border information panel + if self.env_spatial_mode.value == "Geographic (lat/lon)": + try: + bounds = get_nc_bounds(nc_path, env_coord_names={ + "env_time": self.nc_time_var.value, + "env_lat": self.nc_lat_var.value, + "env_lon": self.nc_lon_var.value, + "env_x": None, + "env_y": None, + }) + bbox = bounds + self.boundary_info_str.object = ( + "Boundary file: not selected (auto from .nc)
" + f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " + f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + ) + except Exception as e: + self.status_text = f"Failed to derive boundary from .nc: {e}" + self.alert.object = self.status_text + return + else: + bbox = None self.boundary_info_str.object = ( - "Boundary file: not selected (auto from .nc)
" - f"Spatial range: lat[{bounds['S']:.3f}..{bounds['N']:.3f}], " - f"lon[{bounds['W']:.3f}..{bounds['E']:.3f}]" + "Boundary file: not selected
" + "Spatial range: using projected grid extent (x/y); bbox cropping disabled." ) - except Exception as e: - self.status_text = f"Failed to derive boundary from .nc: {e}" + + self.status_text = "Annotation started." + + if self.env_spatial_mode.value == "Projected (x/y)": + coord_spec = None # bilinear не використовує lat/lon coord_spec + env_coord_names = { + "env_time": self.nc_time_var.value, + "env_lat": None, + "env_lon": None, + "env_x": self.env_x_select.value, + "env_y": self.env_y_select.value, + } + + if not (self.nc_time_var.value and self.env_x_select.value and self.env_y_select.value): + self.status_text = "Please select Time, X and Y variables from the NetCDF file." self.alert.object = self.status_text return - self.status_text = "Annotation started." - # pass bbox (or None, if the user did choose shp) - coord_spec = { - "time": self.nc_time_var.value, - "lat": self.nc_lat_var.value, - "lon": self.nc_lon_var.value, - } - if not (self.nc_time_var.value and self.nc_lat_var.value and self.nc_lon_var.value): - self.env_info.object = "Please select Time, Latitude and Longitude variables from the NetCDF file." - return + if interpolation_method == "bilinear" and categorical_vars: + self.status_text = ( + "Bilinear projected interpolation is only valid for continuous variables. " + "Please remove categorical/QC variables or use Nearest/IDW mode." + ) + self.alert.object = self.status_text + return + + else: + coord_spec = { + "time": self.nc_time_var.value, + "lat": self.nc_lat_var.value, + "lon": self.nc_lon_var.value, + } + env_coord_names = { + "env_time": self.nc_time_var.value, + "env_lat": self.nc_lat_var.value, + "env_lon": self.nc_lon_var.value, + "env_x": None, + "env_y": None, + } + + if not (self.nc_time_var.value and self.nc_lat_var.value and self.nc_lon_var.value): + self.status_text = "Please select Time, Latitude and Longitude variables from the NetCDF file." 
+ self.alert.object = self.status_text + return start_annotation_process( - env_var_map, selected_vars, movebank_path, selected_ids, - boundary_path, interpolation_method, bbox=bbox, smoothing_k=smoothing_points, - out_csv_path=self.output_path.value, coord_spec=coord_spec, + env_var_map, + selected_vars, + movebank_path, + selected_ids, + boundary_path, + interpolation_method, + bbox=bbox, + smoothing_k=smoothing_points, + out_csv_path=self.output_path.value, + coord_spec=coord_spec, + env_coord_names=env_coord_names, continuous_vars=continuous_vars, - categorical_vars=categorical_vars + categorical_vars=categorical_vars, ) self.status_text = "Annotation finished." @@ -1769,10 +1821,13 @@ def _auto_height(self, pane, line_px=22, padding=8): def _update_smoothing_options(self, event): key = self._normalize_interp_key(event.new) - if key == "nearest": + + if key in ("nearest", "bilinear"): self.control_smoothing.options = ["1"] self.control_smoothing.value = "1" + self.control_smoothing.disabled = (key == "bilinear") else: + self.control_smoothing.disabled = False self.control_smoothing.options = ["2", "4", "6", "8"] if self.control_smoothing.value == "1": self.control_smoothing.value = "4" From e03ff8d3a28bc72cbee9dc418938aa684df6fe84 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Tue, 19 May 2026 15:05:29 -0600 Subject: [PATCH 11/17] Restore robust NetCDF coordinate selection --- ecodata/annotation_eng_func.py | 70 +++++++++++++++-- ecodata/app/apps/annotation_engine_app.py | 93 +++++++++++++++++------ 2 files changed, 131 insertions(+), 32 deletions(-) diff --git a/ecodata/annotation_eng_func.py b/ecodata/annotation_eng_func.py index fbfdc6a..f530132 100644 --- a/ecodata/annotation_eng_func.py +++ b/ecodata/annotation_eng_func.py @@ -14,6 +14,44 @@ LEVEL_DIM_CANDIDATES = ("isobaricInhPa","isobaric_in_hPa","level","lev","plev","pressure","pressure_level") +def open_nc_metadata(path: str) -> xr.Dataset: + """ + Open a NetCDF dataset for metadata inspection 
only. + + This avoids time decoding so the UI can list variables and coordinate + candidates even when the time coordinate needs to be selected manually. + """ + return xr.open_dataset(path, decode_times=False, chunks="auto") + + +def detect_env_coord_names(ds: xr.Dataset) -> dict: + """ + Detect likely coordinate names for an environmental dataset. + + Returns keys: env_time, env_x, env_y, env_lat, env_lon. + Values may be None when not detected. + """ + env_time = _detect_time_name(ds) + + x_candidates = ("x", "X", "projection_x_coordinate", "easting", "eastings") + y_candidates = ("y", "Y", "projection_y_coordinate", "northing", "northings") + lat_candidates = ("lat", "latitude", "Latitude") + lon_candidates = ("lon", "longitude", "Longitude", "long") + + env_x = next((c for c in x_candidates if c in ds.coords and c in ds.dims), None) + env_y = next((c for c in y_candidates if c in ds.coords and c in ds.dims), None) + env_lat = next((c for c in lat_candidates if c in ds.coords or c in ds.variables), None) + env_lon = next((c for c in lon_candidates if c in ds.coords or c in ds.variables), None) + + return { + "env_time": env_time, + "env_x": env_x, + "env_y": env_y, + "env_lat": env_lat, + "env_lon": env_lon, + } + + def safe_open_nc_with_time_decoding(path, time_name: str | None = None): """ Opens a NetCDF file with support for non-standard calendars: @@ -491,13 +529,17 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, if is_nearest: return annotate_env_nearest( df, env_var_map, selected_vars, movebank_path, - smoothing_k=smoothing_k, coord_spec=coord_spec + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, ) if is_idw: return annotate_env_IDW( df, env_var_map, selected_vars, movebank_path, - smoothing_k=smoothing_k, coord_spec=coord_spec + smoothing_k=smoothing_k, + coord_spec=coord_spec, + env_coord_names=env_coord_names, ) if is_bilinear: @@ -526,6 +568,7 @@ def load_selected_environmental_data(df, 
env_var_map, selected_vars, out_df, env_var_map, cont, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec, + env_coord_names=env_coord_names, temporal_method="linear" ) @@ -535,6 +578,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, out_df, env_var_map, cat, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec, + env_coord_names=env_coord_names, temporal_method="nearest" ) @@ -557,6 +601,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, out_df, env_var_map, cont, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec, + env_coord_names=env_coord_names, temporal_method="linear" ) @@ -566,6 +611,7 @@ def load_selected_environmental_data(df, env_var_map, selected_vars, out_df, env_var_map, cat, movebank_path, smoothing_k=smoothing_k, coord_spec=coord_spec, + env_coord_names=env_coord_names, temporal_method="nearest" ) # keep nc_start/nc_end stable (both annotators return NaT) @@ -624,7 +670,8 @@ def standardize_time_lat_lon(ds, coord_spec): def annotate_env_nearest(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 4, - coord_spec=None, temporal_method: str = "linear"): + coord_spec=None, env_coord_names: dict | None = None, + temporal_method: str = "linear"): """ Annotate movement points with environmental values using: - Spatial: nearest grid node @@ -679,6 +726,7 @@ def _nearest_indices_vectorized(arr, vals): temporal_method = (temporal_method or "linear").strip().lower() if temporal_method not in ("linear", "nearest"): temporal_method = "linear" + env_coord_names = env_coord_names or {} # Placeholders for nearest grid coords (one set; overwritten by last variable) nc_latitudes = np.full(len(out), np.nan, dtype="float64") @@ -703,7 +751,10 @@ def _nearest_indices_vectorized(arr, vals): base_var, target_level = _split_var_and_level(label) try: - ds = safe_open_nc_with_time_decoding(file_path) + ds = safe_open_nc_with_time_decoding( + file_path, + 
time_name=env_coord_names.get("env_time"), + ) ds = standardize_time_lat_lon(ds, coord_spec) if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not found in {file_path}") @@ -864,7 +915,8 @@ def _nearest_indices_vectorized(arr, vals): def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: int = 2, - coord_spec=None, temporal_method: str = "linear"): + coord_spec=None, env_coord_names: dict | None = None, + temporal_method: str = "linear"): """ Annotate movement points with environmental values using: - Spatial: Inverse Distance Weighting (IDW) over k nearest grid nodes @@ -911,6 +963,7 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: temporal_method = (temporal_method or "linear").strip().lower() if temporal_method not in ("linear", "nearest"): temporal_method = "linear" + env_coord_names = env_coord_names or {} # Keep nc_lat/nc_lon semantics consistent with prior implementation (copy of point coords) out["nc_lat"] = out["location_lat"].values @@ -939,7 +992,10 @@ def annotate_env_IDW(df, env_var_map, selected_vars, movebank_path, smoothing_k: base_var, target_level = _split_var_and_level(label) try: - ds = safe_open_nc_with_time_decoding(file_path) + ds = safe_open_nc_with_time_decoding( + file_path, + time_name=env_coord_names.get("env_time"), + ) ds = standardize_time_lat_lon(ds, coord_spec) if base_var not in ds: print(f"[WARNING] Base variable '{base_var}' not in {file_path}") @@ -1659,4 +1715,4 @@ def _pick_level_index(ds, level_dim: str, target_level: float | None): ref = 1000.0 if target_level is None else float(target_level) return int(np.nanargmin(np.abs(vals - ref))) except Exception: - return 0 \ No newline at end of file + return 0 diff --git a/ecodata/app/apps/annotation_engine_app.py b/ecodata/app/apps/annotation_engine_app.py index 2641c15..9820d7b 100644 --- a/ecodata/app/apps/annotation_engine_app.py +++ b/ecodata/app/apps/annotation_engine_app.py @@ -3,6 +3,7 @@ import 
panel as pn import param import pandas as pd +import xarray as xr from panel.io.loading import start_loading_spinner, stop_loading_spinner from ecodata.app.models import FileSelector from ecodata.panel_utils import param_widget, register_view, try_catch, rename_param_widgets @@ -11,7 +12,14 @@ import re from ecodata import validate_and_process_csv, load_vector_extent_info, load_taxa_and_ids_from_csv from ecodata.movebank_functions import merge_csv_files_from_folder, generate_individual_csvs_for_local_ids, interpolate_missing_values_only, delete_files -from ecodata.annotation_eng_func import start_annotation_process,convert_tif_to_nc_before_annotation, get_nc_bounds, safe_open_nc_with_time_decoding +from ecodata.annotation_eng_func import ( + start_annotation_process, + convert_tif_to_nc_before_annotation, + get_nc_bounds, + open_nc_metadata, + detect_env_coord_names, + safe_open_nc_with_time_decoding, +) logger = logging.getLogger(__file__) @@ -700,20 +708,11 @@ def load_env_data(self, *events): time_text = "-" spatial_text = "-" - time_candidates = ["time", "Time", "datetime", "date", "valid_time", - "forecast_time", "verification_time"] - lat_candidates = ["lat", "latitude", "Latitude"] - lon_candidates = ["lon", "longitude", "Longitude", "long"] - x_candidates = ["x", "X", "projection_x_coordinate", "easting", "eastings"] - y_candidates = ["y", "Y", "projection_y_coordinate", "northing", "northings"] - - def _pick(cands): - return next((c for c in cands if c in all_vars), None) - try: - ds = safe_open_nc_with_time_decoding(nc_path) + ds = open_nc_metadata(nc_path) try: - all_vars = sorted(ds.variables.keys()) + all_vars = sorted(set(ds.coords.keys()) | set(ds.variables.keys())) + coord_guess = detect_env_coord_names(ds) # Populate all dropdowns self.nc_time_var.options = all_vars @@ -722,20 +721,52 @@ def _pick(cands): self.env_x_select.options = all_vars self.env_y_select.options = all_vars - # Autoselect defaults (always overwrite — second open removed) - 
self.nc_time_var.value = _pick(time_candidates) - self.nc_lat_var.value = _pick(lat_candidates) - self.nc_lon_var.value = _pick(lon_candidates) - self.env_x_select.value = _pick(x_candidates) - self.env_y_select.value = _pick(y_candidates) + # Autoselect defaults while preserving valid existing choices. + self.nc_time_var.value = ( + coord_guess.get("env_time") + if coord_guess.get("env_time") in all_vars + else (self.nc_time_var.value if self.nc_time_var.value in all_vars else None) + ) + self.nc_lat_var.value = ( + coord_guess.get("env_lat") + if coord_guess.get("env_lat") in all_vars + else (self.nc_lat_var.value if self.nc_lat_var.value in all_vars else None) + ) + self.nc_lon_var.value = ( + coord_guess.get("env_lon") + if coord_guess.get("env_lon") in all_vars + else (self.nc_lon_var.value if self.nc_lon_var.value in all_vars else None) + ) + self.env_x_select.value = ( + coord_guess.get("env_x") + if coord_guess.get("env_x") in all_vars + else (self.env_x_select.value if self.env_x_select.value in all_vars else None) + ) + self.env_y_select.value = ( + coord_guess.get("env_y") + if coord_guess.get("env_y") in all_vars + else (self.env_y_select.value if self.env_y_select.value in all_vars else None) + ) + + has_latlon = bool(self.nc_lat_var.value and self.nc_lon_var.value) + has_xy = bool(self.env_x_select.value and self.env_y_select.value) + if has_latlon and not has_xy: + self.env_spatial_mode.value = "Geographic (lat/lon)" + elif has_xy and not has_latlon: + self.env_spatial_mode.value = "Projected (x/y)" # ---- TIME INFO ---- time_name = self.nc_time_var.value - if time_name and time_name in ds: - tvals = pd.to_datetime(ds[time_name].values) - time_text = f"{tvals.min().date()} — {tvals.max().date()}" - - # ---- SPATIAL INFO (geographic fallback) ---- + if time_name and (time_name in ds.coords or time_name in ds.variables): + try: + decoded_times = xr.decode_cf(ds[[time_name]], decode_times=True)[time_name] + tmin = 
pd.to_datetime(decoded_times.min().values) + tmax = pd.to_datetime(decoded_times.max().values) + time_text = f"{tmin.date()} — {tmax.date()}" + except Exception: + time_text = "-" + + # ---- SPATIAL INFO ---- lat_name = self.nc_lat_var.value lon_name = self.nc_lon_var.value if lat_name and lat_name in ds and lon_name and lon_name in ds: @@ -747,6 +778,18 @@ def _pick(cands): f"lat[{lat_min:.3f}..{lat_max:.3f}], " f"lon[{lon_min:.3f}..{lon_max:.3f}]" ) + else: + x_name = self.env_x_select.value + y_name = self.env_y_select.value + if x_name and x_name in ds and y_name and y_name in ds: + x_min = float(ds[x_name].min()) + x_max = float(ds[x_name].max()) + y_min = float(ds[y_name].min()) + y_max = float(ds[y_name].max()) + spatial_text = ( + f"{y_name}[{y_min:.3f}..{y_max:.3f}], " + f"{x_name}[{x_min:.3f}..{x_max:.3f}]" + ) # ---- VARIABLE LIST with vertical level expansion ---- LEVEL_DIM_CANDIDATES_LOCAL = ( @@ -1908,4 +1951,4 @@ def view(): pn.serve({Path(__file__).name: view}) if __name__.startswith("bokeh"): - view() \ No newline at end of file + view() From 6cb0201c3cb47de171ce397d3264bf70e4dfab71 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Tue, 19 May 2026 15:23:55 -0600 Subject: [PATCH 12/17] Move editable install out of conda dev env pip no longer accepts the --no-deps option inside the pip section of a conda environment file. Install ecodata editable with a separate python -m pip install --no-deps -e . command for RTD, CI, and local dev docs. 
--- .github/workflows/ci_linux.yml | 4 ++++ .github/workflows/ci_mac.yml | 4 ++++ .github/workflows/ci_win.yml | 4 ++++ .readthedocs.yaml | 3 ++- docs/apps/developer_guide.md | 1 + docs/package/developing.rst | 3 ++- ecodata-dev-env.yml | 1 - 7 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index ae2da39..4020647 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -37,6 +37,10 @@ jobs: - name: Update base environment run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: bash -el {0} + run: python -m pip install --no-deps -e . + - name: Run tests shell: bash -el {0} run: | diff --git a/.github/workflows/ci_mac.yml b/.github/workflows/ci_mac.yml index 0c4fbf8..4fa27b8 100644 --- a/.github/workflows/ci_mac.yml +++ b/.github/workflows/ci_mac.yml @@ -36,6 +36,10 @@ jobs: - name: Update base environment run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: bash -el {0} + run: python -m pip install --no-deps -e . + - name: Run tests shell: bash -el {0} run: pytest -s diff --git a/.github/workflows/ci_win.yml b/.github/workflows/ci_win.yml index 97af7b3..abc597f 100644 --- a/.github/workflows/ci_win.yml +++ b/.github/workflows/ci_win.yml @@ -36,6 +36,10 @@ jobs: - name: Update base environment run: conda env update -n eco-dev -f ecodata-dev-env.yml + - name: Install ecodata editable + shell: sh -l {0} + run: python -m pip install --no-deps -e . + - name: Run tests shell: sh -l {0} run: pytest -s diff --git a/.readthedocs.yaml b/.readthedocs.yaml index af072f8..eea0ca0 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,10 +10,11 @@ build: jobs: post_install: - conda env update --name ${READTHEDOCS_VERSION} --file ecodata-dev-env.yml + - python -m pip install --no-deps -e . 
- conda list conda: environment: ecodata-env.yml # Build all downloadable formats -formats: all \ No newline at end of file +formats: all diff --git a/docs/apps/developer_guide.md b/docs/apps/developer_guide.md index 6a58eab..bd90ef6 100644 --- a/docs/apps/developer_guide.md +++ b/docs/apps/developer_guide.md @@ -31,6 +31,7 @@ If you have an existing conda installation, it's strongly recommended to make su ``` conda env update --name eco-dev --file ecodata-dev-env.yml + python -m pip install --no-deps -e . ``` ### Launching the apps diff --git a/docs/package/developing.rst b/docs/package/developing.rst index c107691..a68634c 100644 --- a/docs/package/developing.rst +++ b/docs/package/developing.rst @@ -8,6 +8,7 @@ Installation options mamba env create -n eco-dev --file ecodata-env.yml mamba activate eco-dev mamba env update -n eco-dev -f ecodata-dev-env.yml + python -m pip install --no-deps -e . * To do a full install (not editable): @@ -55,4 +56,4 @@ In the ecodata repo actions tab, click on the conda_constructor CI tab on the le .. _`Conda | example workflow for updating a package`: https://conda-forge.org/docs/maintainer/updating_pkgs.html#example-workflow-for-updating-a-package -.. _`ecodata feedstock`: https://github.com/conda-forge/ecodata-feedstock \ No newline at end of file +.. _`ecodata feedstock`: https://github.com/conda-forge/ecodata-feedstock diff --git a/ecodata-dev-env.yml b/ecodata-dev-env.yml index 16e2e07..e9f8504 100644 --- a/ecodata-dev-env.yml +++ b/ecodata-dev-env.yml @@ -25,5 +25,4 @@ dependencies: - click - build # - panel-jstree # panel needs to updated to >1 first - - --no-deps -e . 
name: eco-dev From 933c86b3f1dc7042a103946f7132f3d8fda69750 Mon Sep 17 00:00:00 2001 From: Justine Missik Date: Tue, 19 May 2026 16:32:07 -0600 Subject: [PATCH 13/17] Add app manuals to documentation --- .../annotation_engine_crop_controls.png | Bin 0 -> 176371 bytes .../images/annotation_engine_crop_top.png | Bin 0 -> 160059 bytes .../images/annotation_engine_merge_csv.png | Bin 0 -> 209033 bytes .../images/annotation_engine_nc_controls.png | Bin 0 -> 36212 bytes .../images/annotation_engine_nc_loaded.png | Bin 0 -> 47963 bytes .../images/annotation_engine_nc_overview.png | Bin 0 -> 39188 bytes .../annotation_engine_tif_correction.png | Bin 0 -> 37265 bytes .../images/annotation_engine_tif_loaded.png | Bin 0 -> 49650 bytes .../images/annotation_engine_tif_overview.png | Bin 0 -> 47313 bytes docs/apps/images/nc_builder_input_files.png | Bin 0 -> 18159 bytes .../images/nc_builder_level_detection.png | Bin 0 -> 6485 bytes .../images/nc_builder_output_settings.png | Bin 0 -> 9707 bytes docs/apps/images/nc_builder_overview.png | Bin 0 -> 53129 bytes .../apps/images/nc_builder_spatial_subset.png | Bin 0 -> 4103 bytes docs/apps/images/nc_builder_status_panels.png | Bin 0 -> 20859 bytes .../apps/images/nc_builder_time_detection.png | Bin 0 -> 4724 bytes docs/apps/images/nc_builder_time_subset.png | Bin 0 -> 3997 bytes .../nc_builder_variables_coordinates.png | Bin 0 -> 10874 bytes docs/apps/images/presence_actions.png | Bin 0 -> 31971 bytes docs/apps/images/presence_apps_panel.png | Bin 0 -> 40804 bytes docs/apps/images/presence_bounding_box.png | Bin 0 -> 18474 bytes .../presence_derived_metric_filters.png | Bin 0 -> 23878 bytes docs/apps/images/presence_input_files.png | Bin 0 -> 25216 bytes docs/apps/images/presence_output_settings.png | Bin 0 -> 16100 bytes docs/apps/images/presence_region_polygon.png | Bin 0 -> 21039 bytes .../presence_time_spatial_aggregation.png | Bin 0 -> 20037 bytes docs/apps/images/presence_vetting_filters.png | Bin 0 -> 19893 bytes 
docs/apps/user_guide/annotation_engine.md | 341 ++++++++++++++- docs/apps/user_guide/nc_builder.md | 270 +++++++++++- .../user_guide/presence_data_preparation.md | 398 +++++++++++++++++- 30 files changed, 1000 insertions(+), 9 deletions(-) create mode 100644 docs/apps/images/annotation_engine_crop_controls.png create mode 100644 docs/apps/images/annotation_engine_crop_top.png create mode 100644 docs/apps/images/annotation_engine_merge_csv.png create mode 100644 docs/apps/images/annotation_engine_nc_controls.png create mode 100644 docs/apps/images/annotation_engine_nc_loaded.png create mode 100644 docs/apps/images/annotation_engine_nc_overview.png create mode 100644 docs/apps/images/annotation_engine_tif_correction.png create mode 100644 docs/apps/images/annotation_engine_tif_loaded.png create mode 100644 docs/apps/images/annotation_engine_tif_overview.png create mode 100644 docs/apps/images/nc_builder_input_files.png create mode 100644 docs/apps/images/nc_builder_level_detection.png create mode 100644 docs/apps/images/nc_builder_output_settings.png create mode 100644 docs/apps/images/nc_builder_overview.png create mode 100644 docs/apps/images/nc_builder_spatial_subset.png create mode 100644 docs/apps/images/nc_builder_status_panels.png create mode 100644 docs/apps/images/nc_builder_time_detection.png create mode 100644 docs/apps/images/nc_builder_time_subset.png create mode 100644 docs/apps/images/nc_builder_variables_coordinates.png create mode 100644 docs/apps/images/presence_actions.png create mode 100644 docs/apps/images/presence_apps_panel.png create mode 100644 docs/apps/images/presence_bounding_box.png create mode 100644 docs/apps/images/presence_derived_metric_filters.png create mode 100644 docs/apps/images/presence_input_files.png create mode 100644 docs/apps/images/presence_output_settings.png create mode 100644 docs/apps/images/presence_region_polygon.png create mode 100644 docs/apps/images/presence_time_spatial_aggregation.png create mode 100644 
docs/apps/images/presence_vetting_filters.png diff --git a/docs/apps/images/annotation_engine_crop_controls.png b/docs/apps/images/annotation_engine_crop_controls.png new file mode 100644 index 0000000000000000000000000000000000000000..e32a04ff5a2e2a453ed310278b6e2804ea18e985 GIT binary patch literal 176371 zcmYhCWl$YK*R64fLxQ^ncXtj>aEIU!+}+(Bg1fuBySrO(cZc9|KHmG?x9&)KQ$b|z)%MN zbAcz(qTqo+)D}sK3aPkdUUpFrmDfE%V4_07<=01^C0E+4(llF1+r(EgC?@_r@@ydfsZh^YY}j_N zvkhtq@jJYl%;03b1|3ahNW5b#kNMa8lKf445VAWsBJPLV8Jn_d+f<*sDzFNrl{R=% zPhCYIi)FDx_jvAAO7}`K(IfX6!V);^fQWSh<8t6lBpM&?4`zFtr%0YZsbSlr8I6XR z$CkksF;-A!&A2d9%(CKs4^Ly{ygBqBEIL+zX~j7k{cRNHODfwJL=wiPKjyXK{GLJV z>gBBOP5dBq7ecHA+rO(It^oNbyym;p0Jyq%+$8=uR0PSN^OwYFYC%*Md7A*>@#wW@ zf%2_j2!$Ii8*DGNg&CO{wJkmfhrMDQgdfB}MIMvaJW|z+ zO1Z9}IXIsfzl}}sSdK$2x*_?ye}5D$L^+B&bn;T^XUER_@Jq!#M=?FcZG>bjFaewm zjSajsS-D0qSn%A=FpdX|DizFCP}}4Q^3dAYbDn#3ozSIN0n)4~gv$mqRjs!|wf+ zj)a|ofMvCzX-L@TJ} z66HJMpcJ>Jd@l+&kP%leW7?2nkCC9*{E=_~>Wu-D+aDVO5fr==VWIrY?HdABZHs)y z(#yGiUOR!$7E}$XfUiyd025$jg@>!?>CD;%>6fJ2<1xXIia5#2X{{37`|dWDnveP(aQb z2Sn6!={W>qCUP-+Xs!5dOW;D!JqFCdA+={NwT%TejNBT(`vwTehk$r9KT(n$qrA=D ziCA^9K&~{XTL%d_R(X4Tx*ePqhaS&1Qr--DmF^nLpKcyxr^^LFX+U@~} zDUvsfK<*P%oAvFg>c}6K&~3*b)^|I#@K)codS#DX>-%J&i!Th_ICP4+6A||Pb&a0R z!S<_I1v@~~Y1#hs+Qqr0^m-o@bs01U95|brKFwzL2ko0W3`=&8O$mC;LhHXiK7&%Y zp)(Kd=1G`jHlKTztfMUxK^@&7!oJNDd|M>Wqd@Cd#XLj#N$f~F_V@(-)U=)^URlSI zw8)e>3R@!IYbKi*kTVlPM4P`fUBSC=zK_;OvZTb4VW%bc(#-C~(laFc8H+wpup*Pi zm17kf2tKeG$`RX#)w1)0=KLA~%pL`{8aJBT;~#$TG|tT3Q$Og|j@jTn z$D`MQw>5^ej|>@8AFo5b%;kki*P1W-v3I^5j^2yb;ftFx>H2W?-KF(clP3bV0(76( z&96o-rak(=wp-tZ30CG|$R5}CoJ$j@82G;EdLN-74Td2CWCQ{&okH<}pTlZoI^W2) zouPZ+cfH6J)SjGwmdpc3K|~iia%u9!AR7j-r;)7@qRe2g+@(rGD;K5LBcQnJX8P8H z%fjGa!g~-Z*1d>7NL0g{83e*R82jXS0bz;J@N*e?(<*(i9Qo$lbx_2@&;3R_N`-v^ z1=&(;Ov9ZhdJRoJZ*k_q3?z zTl5{=zh>sw^z3tu8N;QV4AI7qnR4y3?i%4jIUeTkbe(x#x&_{V!a;*QKHN2R0(Ky% zPhi};*2rj20w|WAKSoT5r6MY3j!{QEu>-5u+x}m7yi7Kz!!YwUrmqc*oiLCmjw5Pm 
zR|sCSC&$Y?fT4Y8wLN*wPB!ZexAWHLJ1iRdCEG$(fh!9y#4tzxGYwCc=cbNy@wPVs zh(aa=1k9y3B&>h)0(U8C87+D$fQQHwK!?ei1{jin8GgWKT(lyt5`_`@ZEAkqw9;{6VP3Asd0z@zK+A5_CGJv0` zPyJii|CTl3JAr=v{l2K>_Ir`c-=-vo{(u*~fxqb^DVmmg|2|R`59GWyn!n4GUbxk^ zMQRV^>?ewh*EKno6dgk}bb#w&*h~N;ZqK%{@V&S3h!8+e&XCGZ9d8!9DK&cDc((}& z?9Ic!+Cf~P5j^R?1$feS0`RuRC0Lu>Ng)*jDTG=z4KXaLnZfxZ(32n>viYm2Uz2(w zLp^$_KR~Y1ox67K0atp*uIAIJP1deuXalR-2isq(_LcXTcMeknfmy%!UL6Yv`F+s) zaAB1mXC;=lYP8gGM78F++y4{hSw2eO)q%11V^=c4W2glzrKET>4Gf0o;ui~c3f9j0 z4(Rjg{*f_@m12X5`E$>A2gF5BZv5Gfc&&$gcRLEfxn1p^k2kt|74OFNs{hHc7CELd z;ASPU0E*&$)6N+=+P5Vk>N7gn|G#S_P+!KU+hv1O!#A($u);~t5?^QP13|roe@D+M zoZoD2)0Rf&7qNS7H`*BJYnr=yeftOA+SO10?ihf$)4j9M6AfZ@Q$Q&2>TS}Rw;m{5 zO=F!o4YiwSyNu9%*1hx9TZyPncz66M^O=Q1;XeZzY!?q2QOQzPbve=o} zhb`b6eTjbKhPDw;*Ah4l9Ro=Ol3d?=xy}7ru7*rNvU$15;*46Cmn(m)y(}5S)iQkd zjnZPFlxckiLc17H7RX;Jt6pw3VvGW{$_(dm*6M~8y1MVOdO@p3fY6H|;2^#qNwl&m zG5&G)i4hK5#g{QalQG39%YM`yD<&Kiw&}GqGrcz-2P;LrrDRU zo2SXFLp%Ovq`9yzNsYljd-6Qy<~y=777FU>|C@bW*GOPfX{b$DzjnAs)D;9VIVEl{_!fV@NDm9po z1oj21csRwE8DuJpgoe%OFOGB48rzE#bX6!^zz`VD0Ja^@;Aa~Gk~ezWH^EW2ejob~ykg*YON;I|{sp zf5*6b*{Zy9c#>kjJ-r%Tdh3FZ1eoudILXD~e4MJGY#xwRJQPKJT9j&XMs5wMUEe0T zycTH&Vi4e)TZXkG53&cF8P}{mT*oK(sDECQFT$3%x5Y$E72hC0F=C)ITVHJ;FGc6}0Ki#3_3kz;z_<>@AO&{ zt^p1)?SbXgzZQ{1)*0e!AL#tuDgf!#iHX19WRWSA;Z?7F_n@3eLG3KMcJGMELB$q( zmFAK7^IIcE`uFo<)(nAB;z~}PjvNG8|4Sp-*jFd%m!In2_-cC3z zzjnN8pG5EUN_IbkX=Np++&X|(Hn_QtvLqhTROUYh;5dZTw5lfL0loG(sRFM)Iy8x= zG8a@xM%%t(*=e`&F|(Uw^j460`5>y`Iy8=sopHt5hmkn8CU-thbtCo9)$!yOK|E+G z>+YZ-ej2OyjkU3RUt8`*=N5qZp)mJQd)P$`m9LNB&~ zkLL_(B}uh_g6db@LZ@1SMhiimhl5LM2~wnJg&9lVg`{lKB{q*Jq`#5RF8yYMXriVT zz~JT^Wah^*~TC6Lc9bn18v;4oa`a zyCkdLsh?$TF&SS+N`={d!tY2@KaxwIaSpE{D=v;q7$#Gig3^K=rCxv%_}KMUQ2lX(no2N&43_u@h$wk z@3*IlO&2tJiPt8yo{&Z;AQh3I`R_l> zk@Aqh=ZkYFIBQv&--@2zyY74yu4X3*TOH6ja<6(pf%80d!zE8}EFP`tawmMBQh3__ zIbeu;RJrQdLPLG^$MG=Lo!p|qMXw9Shu}tzsMbfQQ(kQ(EK`JK&3A*cd>X8Bcaj>p z61QxZ@BgmAWMS=m7}fQIIju)1-{&ZfB{$!l^y$=W(x+mZpCR;Q2gY!{R1%Irbm8Pb 
zSC1IyhAJK4PKLqChoyfJCdQmkB)7c%T<~PZvrC%5MIV)a`V5+hlxs_WRi(%XAZzCgG?-5oe#a3L2e1-gXj$&I@XJPh7!dn zkh6@NxJpo8&&gCUA+gn;PY0AO8;DVFil$!_FO!`!SF3IeC$~AEu4EdiO|pIx8{}$% zmsy7P`@lN8{5-qVomX=!Q!nQbUBD%-KAI>GJXmX`V^uFk{$sH!il#n?o4NIH9a%w9 zfuuJKGWfht;T$*!06-<-1uU6><=A+SzHfgfBqry?xd^$)yQyjAR>m-v!=S5)ZDV$?XsI9A+b;@FojBoswSPDK1O~#Qg)3+`4?L(otB`QKy{@@W_fmYI_x&1 z(h{B0G6JUUp0tHVJWQ=1!~wnAs?sJ*2(#|Fu~mb4J*JQcublyLzkZG#ymxK?ASNeF z7_P|DatC$6`7-CsYX`2jj%|6i}*m_kInNMnE zurk1((`N^M^<_ADL{xUD1bT4lXrUl(h2`(0oY1tIFNN@gVXK&%S2b1{Gz|i&QC9sn z3RvARpq96o5&=STkbc4~RbZ?FUOvP?DRU6VpsAKflC003M=6r;(sVByt(MY&Jat4t zIrl{Kkj0HT0@_OtfkbtDf^Ph)%MJny(e@BHMH4VF*(_%MSH3wUW0A}s6@0nHpi#Vl z#9^=^7|glv>8pqG*qzFag4rMNU9pAm&wyFh6Jei3KnBSZbkh(FXM}x(@7*g@1+xPl8w=QIHi{gmThIIU!$sWiC_89g5Z2 zm`|~htb+Trlt87mCh3)NMQpMpTE$A3J^vWgG>;FS%Qvnln-NMmVk)U=7SQwNU}z`a z#oU?GrurNViOmhAcVIe*cELWcOE})jG<{CqbE?jEjIEvutQHFY@RH z8F>6Xj*Yqkm8{zh1_fw5!ZOcO_}7!GSntwC(cz%@?^>*N{k)$cr~iAu_xl2^#^gj< z_d7q55Kw+jy!!00yh`xXeLPvep5&+IdB(3{OpAeq3-BOwsfd`8E2FkcF8wd%;TwT8 z3?c-M6J~Fs<=2T8X9OIw=ZSKA4q@s#bPK9{uP{d`a=LkBil$dEPusYsJc_f|0CVFr z7)v*pg;UhpEz%uZT8O9Nt`|os^DVuHP9!n=_>|baEX<~?g2zbtL1poKltxIZ$d9ex z3~bUj3d9RQ^!m%-cfCP;|6g1Ho{)|o_>%zCq&icIz!tK31bExV6(%D72QQpM`fpXV zmy@W&&IT$APxv_Pnvuy5_+p2;?HK;@(B<>*lX%VZO0Igp>O})4=Kz6l6)FhJO+do< zo$SCGtutRDB3nwE5N9qcQz)4x>*3?eern3xSM6whU2u zb4XEBu&MK15{I5SVyw#z+hr_5Mu{Rv?j)Tmo|g4Km8So@;~0#929fBZ*qn!rgL~z8 zuY7f8UR{V}`fn;_Vr)$#WjJHM5eGOj-M##Ee@_AQ&ly}`^ej&^XhHImY`#i48AH6J zC1@eqKb}U{LbA1!Bh}))OaY*Jm%`z}xXQs<^_g<6)mmc<7pDR-6$s1Y8u%7<=S$}G z;=K==Ob(js!m0M2p1eah{ypZ@Em>4Y^C4rb<0REbYW!g3u39RJmp>R8-HME!rQZjM z2E(u4mCPGjoh_Xt5gCzRHVhY?_0@1tD5e;qFiUD*&{8`lZv!L8E^7|?2{_o6hCbN7_yNKX&XzdspVxw9 zFD8yA|E*f$h~?K)Y)CW5(?%k06Fa(rRd|7vi1(q401>}JX_P!fh3FoRM`panR7MhI zh4sgp4oA1^pLBzaG6$&krKkT)MAHsZQZOpHmd%rC6StZAf49rWD*4R^Xb`))TUmUlS#PA$TUNwKsO`K=S7x-g#@O_GJ?TFs` z1NRt9rn%k?&m9-V$3U^ls~sa1sI`Xr)shzqrcU{@v1VG}`y?I$l{0qaK#J`GF3vV< zoOOrvg3rg+yrYWEL51sGqRwA?eOPf)f|7u151Ko==sK0vcNy!(A3!OkR6)xDatxup@}DQJ 
zEd1A&h1l5Af6k@Gdksu*X4!HQ*Pc}g=F(jJ^psSoKA7Zrdf&HSiAXlT<^#d!*u~zSpFJc=uPjp+OI>Hh=yLp<4VTWn=P;KD6gv-ygk4Y=58m zK<7=s=@8(wG4pN2DgYgH=LlfDk*;`vu{!{{4P4=s>*(IljKJsnzfXRyZ+|wtSpHN3 z)Hv=BUj>_>|90nWKrQ1+9D+qMm|UVHSB6V)y!=py8pHd&=+Q&2<*QzQ4JKT+eE;pv zBc})&oEk&BZke}ip0_PT8W)O&c1yZqkfc=q5vtJnWquMx!D^o!@&)kYyr0^-wv|p> z6O)E6n_-3G&x50xx2P2P^~3w0#CkhP*`QA5#CgP@kSSJu;t*Rxvsc-sXWgAThN)~I zBHo#qXb_$trj?|ulcwW~X(?o0Q|Q&CR7K<;%uv)Nx8tgE{$J)^zoVN@4Hq{=rXF!w9DO+9APKp@QjV59cyo1(n%>hB}5KLCc!XAE!5YZ+&gR-w(d zuU1S#Q_g)0a?ueX=bawuY35M+MNr(^F<3G-x$#ezfvvnShLJ&|>sZ(p$P+eOKHYH7 z#^YbQC2#de4$6M%i7muo!xI4W)9W*BRvD3{fs>?1ic^6ZKNEcWQf9zT_3hiz)?nH7 z-4OjW7yFO7W!e+#pj!&zw{OWzy>{U6&bCv^%fVGtVW`Ps2@i7!hHy*$BvChZzD3bR zHlZ-TNkJobvS-kDFeedKimGA4Bgu=9frm7j-bM66IO`vyl4@#K`U#u5;UP@UmaX}B z0M*MQfkr^y8SBSw|K99igjpGP#FddWy?{}Le7kXZf3MP~-wQCL9+`=~34XYYxF|eH z8yWdH=NGON8(g7gcjLEiq`_*b%wX_szOiHA;us{1fQ<)0Vsnf_>&0jxQQd)hL;@CD zCmv@jT{1EF2ZpE;@@#sqiTBS6$F7fEc(jNE*;I{k4!!Se3bCu*jeVwK@Khzf7Qwbk z=EO^r@odit;ulEC*xY9*k0+X4?*GPbT!;slX*)WVEME}zdwO&avNR{S+Wz_EkC!-j zMtP6M%o3f=*unFc^5EGv+IuN~ZG@Fm>Z`1(zXQ@Jgst6vcX^*{J-LuIVh#6_{DaZP z=@o2wfQl$a77VAP%nvv1<5{p4=57;pv{EzmDXOi_TcK9J6!IM++H@`SEh zCCs$mmy|-wK)ECmdV^K)IKRc2kGInZ^kaIP$fE4j{{*lfPDB%aUjNsm;H6;a4*9lL zLVoVj1zJ@NswQ)SWSRvA!T5K^B5jHK`LB}&wuHi5BOyNmP_`g@VcFcUqLF?hZQ`}B zhlaprg7CXtO^tH5U--LCZ?1}jjNY-uD$@JWd14wFeg>!5aUonl++d!Af61tVi3w1} z(d9A1BlHgpN40~IxWk!BcC{Q4#zWT;T8sW4MRgXNXm}ky%HDjWQ+t%v$JESX0hIc$ z3Cm-0*Ng8&80bE~JhR5_EJYqn6MZXGv-l->3(R+2pOUrKL3lP-@b8hdQPJ!1i#}nS! 
z(V-dGFwB-fha$5vE=jYyV`kcrQ@#nT8qr&rl0Wyeb)z=+E&IE$9%@_!bo6{9SagSV z-W2bEsk`e8R#+N~D@uC%9ZYUOZJhVrI%pm}zrOdr3qIle57Mlvz7$vHkFdqy_y;D2 zL7I%5@k@-b#!3uTzO*=cW^v=kJM@Hu3I~Zl9y5*TjzQ(*iF8U%gfQ8TNO{fDCphga zdkEJuLyokzL_(74g*zT8jhpMP}m3JPB%H#j%65^~a z`VncjdW|=%-jh+{oAZBamb4|RmEkUSW$7ar`BsCV@9rowVH6>-R}5X8qb^vpw12>g z@!t1`>YbTRP(I)FXp-wUo?wOja6I`6i{C%ii2DbVTUh%*SxhW&KNI}^-T3|&6T&c+ zjeWa#H#NqS(;));^fk~g1d!z3+2Ne@M2KJI#;s3cF!jE(Ox}1sGud4~KIDCBnVm4; zwz7IXacIxo@E^4?Md2#mI4h{VSvnFd(j~4yp`h>VSQ52d8j@#H%kO*u9Yd z=Mh|T<2_vFeJpq5%xCH_pQ>RhP0hDb3(~NF^aa;#iIYvOlLdjs`bJ)oA_}F|zGO{r zcv{{n$B0(B+)@wKj(x5Ho!#kv0?OksM8h8saZa4ld(Ry^pki461w#dj>hF|Sn5EWR zzq5emiH}>gn}Vm0Em+3vqKZuh7Mb0a2nJa+wi)8P_!mM_hN+bf_Ak?%uX9_LCo>h& zM-j_2=r&PWb&u4>b2M*boG`PzaF-_%J?o3ift9=Dn4zP%A}S`B2B|d6QA4(nw5WeV z^o5BiEW;Ab3kum^C>0jO{vAi4eXB2~I?>5LQfM2{Gs@iX(P<+S?1t2&51OADtz_0F zgt4|2c)OJhI#}X8D#8q^*DGr9Ra zIXAPkni%6YYEHd%aXonZ?m=NtTF*V*FW%TjmZg#JaO&$&EL~6Y^=9qta)vRA%{TG1 zWT(ra;~)blM4ctV>^VN(uHk2D9yi#_ZYBqqJfN*vc#;YeMB!g2qH8HXk&vwNDn&vEl4IPpl~4|@=|%kZ?()OS zjTpe=Kax$2ADmg5iJRXks!_;p`{cho24}h_&PmYaB#hgUJ=Zmsze3L$+SnXBaxe3dXvru zzi`MJKs@KyAm!iff@4E@E+_j71)HN;=-jv<_Mb;P6FjRw1eyqK8BLc#-itil#T+ut zRac8u4NtSYIa_Vf3U75Yrj!_i%2j-6(a<4Qns0294;$>m(Xayi{u6eIburo8H zQV_}&1Jt~FCWca6%`(y8OM@8|S(I)Q60xU*T3}A~mZr}PL^WJ#nJ$!YP||_n&O}M@ zd9c0m)1@k{L|5>H7%Qc-J4IZ7@DNS^QHqCfC1W21UfNm(#D=(WL7vu?3Ps&8^ka*1 zkz3h~s}Y9+ZBZ47jp&&~HgpqX$SYx@|H&?imt^1yNR)hATl{$BTW#JsY@3a=``NaE znR_+JBh?H={V#96er4PP1yHc5KAU;?FLMz&cHyw$^lk@(dzY4@Q)$P{!xc4D(aHPY z`h%L#Z+Q7iWVMW2H5_501RF?X_>?nJoFLW%ptvTxj?$JlLCxk^qeOnn@c?IEemYeV zJl!+#{7(E9Ri@^@;Kk7#2Pi6+5HpwH3FnzgTbMONK41PKQCR+l$LLs}in16Fejs5w zJdvZ>!6gp6!$sl2>tZp=#CdPfDwY1w=7N(?4sPzW1$pxhbAyyKiPOl zy=`taMvflgM|!`xvY>-3)Y71iE}O)rS*IVCXG-Fw8%qH*SJ5xx%HEMu1ve3u(tK`h zHGp3W(-Qt=JTX*rzn>%X4!^)J;@q@q6iX**(c@N7oj|(SQiSb1qL>8w!M#lozxDKc)_z7@U@1?U7JDYI?Fqn3z%$8RNbf zIdkh=89XXFtxEn*KwcKyluD9gd~*mj-VFmrQyO^A~Q}!!phuR+Fv?i#tBH+U1J6rIb0YB#~hy0x701Pyp86@giwhon@Q>J&e2MI 
zv*3DIU5H?3jb|lPKNym3p30pdD|2xBMksS&xXDhYeD^paIDp;An`}^+^ZX7#Pds-(Q$h$Bsy#&KtE>K(-WfARN;27g0w>dndFRodN z9;kTedVO&fG3eypkW$LPv-xMO5dc&Do<-S!)I~(v*#7WWM$x=gWro7jt8}v$lovp( zl-l19M^C733{#BV8qnGWS9vVMImr4=g(!sLIorKsi#oJ!H~sXw;P_wb9`Oa?(f^90 zJ`jye33G+u zBJLr3?|u+R1bQa=JSF4eXkT{v@-yjH6e)~?h9iM9vk^f*Yj=80Y~##qFX_Na^QMUELLP-AjNrirz@v(Z0oSu-K-pzpN${?__kNBUzg0xsFhXeVzqMoOStHb$^7$LTlDwy zk0VwtH1>T>unuk+wbRxJ6qMY4f|49^$7487F@#8WBi0QZ>WVb7RhK=USAEnE>Xq*$ zvoDe9e_2EVhOtvy*~QheiK$bQQXkwLj`}fH5er}9jL5zm1tk=LIhH`1YzehVEbEM< z)=#YTa*e&tQH-{GcJ@A6j_F(}(gcTWsfx@RDmX8G%b0c(wf2~%N-A+Ts+eO6qJJ>2T~3$b7Ci(Se|U7+OOyGGYHZJU0Yd^^Jk#D-?z6`#ev=gtTJns7j}EwtNp0HQA-PSVllD8RfKg zC`WfF=JBE^{Y>=Cg^!HUL9s@_X$aObuIsko$Tth_m%b4=45u8)QKnj>b_FJJv8KNI z^i`a8C=H(z+HpOqmLGIWlQOGJ5{j5LR!DK2A6}B?Dki)+Om;)j28qlv<~tQpd1=cd z$w_iER)`qOQwzT!QZOeM*oMRI$%66FO0Dlz#1_6XgrZue@QB})A`3*nAZ!Tg_vqWb z{MNz^Zczi3y^vrPd$0NSm;Q&?1b4-~fK!!2D-B)k@!^q%GmXfV(D@?fG+A&-o|=`4 zW1EwIy3NU1op)o-s?240q-g5=X&QVI8t56?am38B>aNcW3|@Q{3^9N4)9anpG@bS5 z$I}*%D2;|9e-!Q3)WjyJiYn4y4A#4+M;rlcMC7R)I$tu)Hc454POC>|<;)Zf-WF0{ z{qMm-mA{H1l*Y?({bfh!tUoxLUcCT=`?*tlU(RB^3bL+(k8E(g&&u1ti3*X2KV9Va zCA&i8K6AC&PZ6~Hqi_UMhHWNB^NIUJ1{;l2nIZ-mZ9>wG$HMl1SprVofGV0MI&I;Z z1ly<|&7uoUy(z*<(nsZZ`C?*Hp8y(^+X7vvN+XNost<_Yl~l+i&O_nuMF0VW|3@hB+!L_!8WO=yuD>A zcgxyRnh=`Q1T}QZH8$z=9tG7bVtO>Xb)JmsSotwurDV_%#)TI7J#LQFm@X9~0v0Be z^u$YkJ(GsJ%J@-m38tkH> zR98llT^bFjZVePQH;gja!V5%gUV8Cf&>CnbP;i(d&RnGxR~ml{w&tA6#ybI6W`Jt9 zo-u_zZ3=?OR}q(tW2;9WilT+ey}it}YT107X&6tEDvw#69gU0;%}@=e>GF+Yv?t{0 z04kbZiS(8Kj%J;#weMcV<}=&+GlQLc*y>DtOzfY}xmn{a^U13Wlq7y^;)tj94;|jK zl)IIknbA1LxGk~X^^4_}PDF|r7vb-!r9uimPuyFTpsx8Meh4?XB=e;-SPfjLx-^&0 z)4J<8xhNJ|h@`qL1x${g_S4J5an5K$1kd#g5|L~ruQd|1o>Wh)(=?i6*DXS z!p_^ml(WgqYIW9?67hQVXIjF`9CEHJJ7hh`_&!>hW+dJj7K1gBqh#&I*T&?dp&B&K zlEhTY!rM8>e$T!JZRGRotes2pigfIVJG=t34Dr}x8Po&Bf}Yoz<(U=HP>BRu-%@@B zVDV|JqvDd1Gh4w}6)sO*kJ0|g{AOSN)y^WQe4Vh#CH9rFOaCQl%q%66TT7>)#1zO7 zsnhP^@1&LJWSPpw$U<#f+{vM!GM2C?R@4F=j(ha8!j!@$D>36& 
zUAT!&A*mwzijkeT)OK>GS4X1RWYBMQD(EuBQiaKHBJfj*>*5ug?ZtBuRRN5j^45Mi z@v1M3sjs#rlCybx=GG27`P?80&lnx`bjm(42=g))m==n$!jpvxJQBr2lB9WxNLpt) zZ9)vCujaGgN}i)7V@=D@=SVC~s;kNAb(>c=l8WbLB&*=ZZK6x2Ov+BXt92TS|GuKD z=hM_BtSHT7pw1$rj@5}HswV~VRE5j32ME8?)D6UmG~fTkN8PPp+s`cCZW8)$!h0Kn zVhA)4_A1uTo)eR6376nYd`ULoI`@+((1&dg;dG=mv;lmN7$g+b1s=%QeX}9|c{&1m zoB`d$#`4_(zOETy|BCO+F;UOWd&A8+9BwD;-(zi%{GVje@QMC0nCeD&I;c$aG)Zp^ zQLQh<6IA%yKcDYCiA`4IdPD}S|CqCSQ|yF8m}qOr6ye|(U@E~XTM{!ckMKv;Bi71G z7~WqCJl?Kzpz$(D+DTsV8f9W_T9PIb>yUn|#Ysn$*M{^%;$$Ub{e_x*PRjd7Q6%Zv z-q>0g#(MTlL>lxGk$36$`${;k;8J7vE?z1h44W%pSiV}!!DZWR$Qd(400Z5 zOCLq1Dm$UUcP;*|P315$js1(@dOzJLB2&KBmK44WE`g@kze#Fw%_kGvFD&4|e2n>{ zxyHF=RG&T{wxk{?bzJ2a146&68I6_mB42Ouutll6UalzIu0I`_;kHWO@5686Cxzdq zSo=>jTk*R%QZ|S@5ClCU7X%Ctz5muhwjJ`QZ46+;WHhg6sn1BN1DUF{v=px(T9lhV z6gB#v@g!GLyA;5vo_|@O7y52p*vLm}9NBp| zo9%;-1HSyu9fYze#GPde`GBC={Zf?Ak3mE>LOj0ZNvo#$jf6HQy>&`Yjo+}S>Yc0+ zFHdz-{dkOqOjqyzjw0){p>F`rd3%t3b0rj^+4p99HCmr`L{ZaC{oc9p9y7n37gv6Z zCwg|*z{v_#^^wq%hWrDdCDR^T2PPk2&g+M;)rKQ^Mx&6iHLd>rt%ZpRVm*69s&GP4*z&Xc>?lmXmp9qX>JnwPOys#A6c}pm*Bfgur zbOtXxXBo&Xf{}D`20d;H*I-x!q-eg4I?15OOoE^zhMCG=V-LG{L*}}L*d)?<0~ha# zWPxPSmCl<2IN~vlH|pu}hB*bXFbd-ocuZc+jU)z!GwrUkB*j+OCPqg5zq|cVFDOZ{_B(l+wI41(0MXhYu#Xdk>+uULgvSBcCH;+)QlGQj`CQ z#)cg+$tp{2R>#cD`U||jqny0kJ{2`xpLBEUKV!?5;Q6F)Fh#RKp}8|2w{T<+I|Mcd zMmYHd6%JX4hA6yQ4`N}#FT(hUPtYzWaLneQUDKX>055am))P5Y z7ueQ3Z`fptg52B7@ly0{|2$NC3>o#E?6{8RlgZQ=uPw^HnqM|j68ZZS z=3C`I@ojQ$-|~|xp!cy65zl7mLg6KT0JEj2N{elEB^Rl&mUi>9UV9V5a6QStH6D?n zY58qKqtA2tP1c1elKNrddDO~5&tNX64r0R&tbOR6^4o9mOw&*c^nK$d*mkS=$xMfj z3LYJL<<1uI)P5&$DwjL+>jnzA1gF~XM3nO>cp@i{WncK;GX2P!^~Hj&~BE@`O_|!OJiV;)ED8+9Fd*-`G#(`mS7`yBm#n(#wS6VR0>C{3h_$q>0rfj7QBH`cw0lbmTGBcniTtVS?m$ z`yB?4dzyOT5;HH$_I#>e&F(hEm}^hZ#M6Wj-k{M<|EwoebYc}vBDxg;)!F)^OF(8G zI4{gC8t!8znza?YCp+htaVik7R)ga0_nOB^fHTNr_>4OBCnCkg#D7y@?OsIIo$)21&*PT7V~TAVU`CvgvJyCE{fyDjO6 zf70qN2gpZjnZt8cxblzPUoy#FoJ>0V7<}*7 zHgh>m|A&SSCIvd31@uoTaC!TGprR|lA8wOk_OodnevW`->*~@CSCsZK>S8jH!0M(> 
zc1nB0PIOTnbWx0^O_9)Q;zLaDv1&)&*EyG<$KFveg5}RGA~g;%`c3SDYGYapZxtA# zH3UuM_Z;~~ND%DZb=BQ}{gqGko%37JInwR$#QQ+Y+zma_*YU6dFzxpqz^v~ZZt>H= zlbg0L@jK{I9tiUZ3?bsb3nTus;QO?kl#yclxZT>PP>3_%Z)R@}r8siqib z6Hh;CTaM1j^kZzJF2;=epi5E05khWt?@zaGb;s$RQL}vj3ay`OEn*S9j zT9h0+ck83lb$^Lfc#iee&8DP`jM4OpXOc5|yIX%u18h2Z{XTg|a7oJc5zhk>CXAPO zw_Bsj$Z}8lEciA-NA#0>F37pSWTFV-hYjN*`7*ri&Oic6SO${X=8hV&5%x?o+-H%O z?gl$nrm<|xQY@Kv1FMDOiw`e>s%Ui8X+<2gu(h=YE>pnwbB}4>ijq2%YR9` zWKDrA>s{z3z@#|&11thQz~x%kHI+^gZ+~idPwxG1IC}L;{rl91VW>!yd1 zX>Fm>{Nqb~b904|aK>^E59i1SGHt?A>0>b!$Abm_m0JJLN8)nc_gm$8Wclw}4OWzL z%s59!MOK*-M^n?w!88`SG(i2HK$qCulw}?h3hOszo5)MjX09y%R!_k~*ST9ys*+2q z8@*-vQKn&~=$Q4tgekp`njkt>$leE`@rA_75-x9iCi9O0mlQ>nmx$9J}M(^j2H&OeVI86&A^ z?AF(Scb_Z{|H*lvN8VeB8N4GvNXWNEe{;>s5m%mAcwYITiZT{oo zT~^lsSKME#(<6;UaH?%|tB^v2rlG=UM{ z-895*d727FO6~n(2cayrmJN@G8{K41utNxgE_F!4s`>!i&)P%O9MX&8@ zJ60r*KefLV042X1Ev5afmIyrEF#2#ZYVjRA|4&x??NadvEcREZy=?cw^eA>NL*%3v z)n++)f-G;$a_`_l8B}Y&U-zxG>XPB?1kdW@U^(s^1>*kO0AnS42&D}JkJp;~axFR|AQMu&rKoRCTyY?Z!`fQ#+CdI4=IQTjrR}{*IP$ar)<)OQm z@#e&%@?|c^Q*BMdhw=&5sGDJ9H{P1uNO5wA_S~!1j?jPk;W7EuLM_a^sH43ln$yuWS0gwQ;sU)tYET zVc2dBKYfWKnnOletMl;rXCi7DiJGRO{rmTS>FwSUdTP~KPZq*n67A&=zvXg&=spX ze#PDH0N>XF>|<}vg_;p5A6MTVE+}g?pc7(>db8Frtj3)P52Fw|e-$2OfCffd~IfFj@Y;P`KedbL`&@ zX6OG9Uh{|l_cQzbhn#Eu!+4MX5R>PBz!}>=Jc1s1v0NQu(Sb8 zwFmkCH@5!&0RPA70J!qz2w8c@WxU$~)0H%9Qn@2-1j*L5roAZEYKu|-O1 zBg8dNXXp>!KqrS$*@=5mvWhTS^(f%$kK9Qvdl7Hw9e&Fs;(Vtf}J}xhhPTZ8c>mxA2+ApD)#GH8=ozEXPY}k7f zieWl?^J!VeuU2k(T%P$$)vE5#2jKa#=KZlm88^8!MA^zFEz^W8EV&JzWKJ~k<#k+X zX$Vmqr78&~w#2Gx4adyM8vsc05+;Ji_b#r*SF-Te~x1Po8ez%dZ zvU`RsKU$RcbY^{GiS5KN7Szgyj@e7-|4J(azV{YFe_3BMFLh14yg-n$dF7sKz&8pH z-HTb4`$Afc>4Rag)Mb@E8ex&?LuN7gg*E1AQ3b$$858o|pA9a_x6hH4 zJfW&0YPZ8T9}M9@=s)M%BbKb5BipiJ$Gc#d?!SlT$+kfo#gIDfhnP=p=zil*8}s5< zP>)M8>5AsJ?x4f{=oz)m=P%XsGOW9AqGSEI#bxz7B1GAXR+hiM$$nx0y)}fgTldeE zRmO-)<0QG}PY2%{Mim`LUL-a>tEz~WRE8>yTvf*u()O65-AB{mPqzq^b-%9|=l^k1 
zLm=7B%G~pR2%ERY8kK3Bp12X{^w|Ma`{p%X;->gDEm`jjtbhCF^DB#auGuvcp$o&LN7ImhI}kjWwj@3Ywo)9!u4Sh_zeaLD_pgdn!Bs3H6$YSEMhh@_H|0 z5-4sw6tX1e*|ho>)D?R#;P`)4QPVFZYv;-GXRE5C<(19XG2^N^efQZ+ZFK6^P|3FW znc9rj_n;MyT|lpp4NoNIJx7!^AHR3xG9Cmw-yZl~R^=Qbf2O86NY=y=HE?A{zS723 zwE2-X7);rku4yuSOk<$h7ODn$*7Kybk*elEqQXC=fGsW!Pt!+dTlk7*A8~_^q-ivF zz=Z7ac*}aAqJ>A+L}<#IE@QgBcxByFL_zfOb%~YTz*^txL%&jN2^Z!FWR=irGt})n z?qD)@_!e5Wxh-zRwggG>pSRhZek*0&8?#lFQE4^LYkSQ=jPD}-TbCCp$`__<1*sd> z9y|xagjR#;0mkkyvZ~@^?0L*R^3fAH&A>2YmsLI?tSi`k;S}69jGkFvB@k~+$gFs^ ztnI)sI?{(;PT%^A%sTec0uEskC`|ch(E2ydElkgePNAm@*Sx&C?!A6=>JD05xR3Mt z){x{a(&pYH{rzuTyZRKpDR}9+`6>f{alvf07D)aFcTi$>bv#`fzp}34bU!qG-Ft(t z7H^MBsTBMrcYb!s%6+Gxw7mmp!J1Z)aZ1a_(rjK7&W9Q(Yl)H8R2;i{nn}0wen0lIeKdmWvhyZ-t@KO` zWlUIQ`9*#g2yn-3WV?grX)5L>ZJWP*!%FL$(1VWkq37h~k!&dP`H+AbMjMyCg#7xmk=;Zp<5`a z*zv2?&2gzkFK>2Wch~^R-SK{W>So?+8?$#@f&o)^^4hbal}{$AXZ%_G=hY=09}mH} zn$>kKScvn{1jupL9Dh{@N8J&i#GB~CDB}#GQGW_|0Yz1Ff^yrO%!o?6g* z6)yrDzJd~U`{OdIB4nEvYKk{@yo*Q6LG;qv7J;xJNZsg51OGQWyKNfd?LV z;KBbUOm_eOJkiDv7;6NmRzD`~!^2FbKPJ&Xy31gI@d7w=0pJvX!>E4%J@TV}{Fi_I z_aFWEpC5VTpZ@hH{|?tj{_$V%^+*4BkNGQj+D8Kn|Ci;*h>N3$TE4WNt7yeZ@xbO( zwLevQqW%`L-9ysmW53jtzRMH?{_y6_>+rgp!`LeU-<%!+rndj3T#M~40@Hno?Gi`P z`m@yfDKZl~qj#FR*N^VzN}Hz3YZ6whFMsosV+d{C^KrDeI6kd4B&~x*w+E#4_^Y~n z35!5lzfe&b{d!hT?p+yD$B`tZlaTS&@&lz0&(?!E-nsM zHb)7!y^v9QY!I(tFDu(QQ(Qb>R&t;Z+f2o_Hv@l5#fr-~iKQ=X?L2s81c-N0>(P0y zZI~sh7A)SZXnONF^NN?U%V*QIvouW&=P<*0S-JW}u`*#PF*`-|3o0wG=OECk`uFZW zo>t3G%ny>(2T`V&tnI-{doXRDvaERCT1O~Z7AY$bB(DvVZitrUhc2$*|3w=nvqYvj zLg*HrvW1h~@t9l}q^hlW_wM;2)N}I6Ea|#b_;`{~jlDT9~+huh49?CQx znVy=}c;ejEA!OQh9C%=asxDg9+;kD07((k>4+JOW`HLGkDu-99mbZLM`SBY-LORdg z{#9DRGpZuvCz!nD?L0jMrwEH63bqy`9I1rIiIz6T6=rFo~Wq2TV>H5f&;>eVu->=t$RL|c-xoyX0inhdx z@@C4478kVcxQe|GFR6-RBxS!V-VMg<_KRrVGVy|B;&J)Ph2o4?RqJ-1xd|m)Tyi{| zut!T9PY$CGhfzuQvDry0xyv%5$h^?McS-z zi&Qm-FJtc=y{T!&%UO}jt0D;-JEcBO(ePqM)xzcKxql%RzD~YGuC<)J0{XLc_a|}E z(r}qp;5v}fITG+Ii9vzt`7AlAG4`?1Vw&zyOL!+QcF1AJ5x$5TU*I(C^Yq 
z6Be(I6X(ZL6+VB;k~HqaezZOmOO*(ewc)D_FIDxPVwMwMujq&;3TKOpVpnW@HmBa+ zkD$q(6=uAkSUD$IHlI*Gt6JTD_7+q`Qq`3pS@(KDy*BG-o;y95hE#|A<2zT%>G@bc2Ux%u5*L%oQ*p_o=cR*CT)p% zef6Q6I2eb>)6ZI*9=>$#EM?7;tBt#uoi)GRVhEutz2)VsjM{0E{Dm3%?U%5VuH5#~ z<4ZR_CoS0i2|7854qZjRO{<6#ujh+$0*M-)?Eka(*3oSxX~OsH?kvpNot=57(;YC| zl4WLwFm%$~)4kn2O(COLS63{V;}{Y%%a+9~*_PQc#16+{W@d5>F}vO>rMq|b-F^R< z{q{ZIALE?o=*WtFb*t*By3eob0TU5#@)mZG#NFs5pWg|^L)_ygL}eo#nK}Y)@Do|R zxjI*BjW1puD5&x0*AQ5hF3c(qj>1!9^pv0+qQ7hb{E{FLrL1Er{RHwatJ{viCFKd& zvNLBLyF8Lr48PrQeeBLWDA4Z@O+{@nHmzUA-)IQe=~b@;)0dTl6K8pe{tNl}0D z`Ne6VJN{@Lzamml@wKl109@kcz()37zl36_MN7BmZ`U6@Zg;?0D=H7iYrd@Of18}| z%PK9AQ5%TcqG+9AI~5;_>)W18LSUA39gogxSf0FVb-o%d@x!BFV@i(Cj|ILAErH(! z;WLC@7th!eMbC=GvN!N5J`vPje1$rbEqUD!$@#$yrEfx6a+UFlJzD1@ULA=pcS~zqBC;+?L%Vg(+#Vv<9K=#d z&x}Bwyfg`pKbSl95S)1q`tO34Tj2A&qwWb(XS#HAS^HJ{#`z~j#t>HZI$X1wT={io z9c(iE#vgVW0@?L{QXL+e2JrFmQLZVFS{ayH8qJrH$fZfzW5?~$6CU&u50P#e&+uLI zRXF>a9!&bP%YCt$c&_x&YjhpMk=s6Bao}Ku;j%z#x$*MEbpR?4++58mjpo%fUYouR z0HZ|lB`5cf+}zLDx&J0A+S5J^$4he)_%mvKxq2725gq#ryFAl+2m+f2+vLks(PT%? 
zPl8j=XFkL-oqiCwCgp@vclF$#xitlLDGvt0l>}??XQ;#2)k9NglSn=E;6qBC@1KhO z=`v4oO-zZZ=_T3*l9;cAZ7+MDtAtDCQxqci&@HvV6)*Rq7r1}R`zl+#-|p|jQXLJz zYDq%vBL6SSqAbdyEXrTA{H6RK&nUV95b37y5(p7U195Wzvr2Z+!tE{7>88 z{Q5V){ilEa!>|AGUw-wgfBc)jh2;N_fAz+@w<{{ zQbJ)EPfcVh9jU5->|U>|eM{NR&Z2H7QJ*8H&6O&T7N}O^RqFF2rxrj?_w_(Vtv}xg z?cw)WHfLdR>LPLe%q#YPhW}By z<1SQ|$)0K7job7#|DT_@#g|sydJmkQ0QG%mHc_)9lk=jfr4cE+BU5u@=tW;(21pwNz(@>=JebiLx9(OdM&$ZExQKt z|NVJ@6{$WHsa9vU5}DABY`ixQM&|%G-?(N+Stw2CPf@I7lnp*Zv9C$R3gV94(IV{` zd%WvI`_{{4`+4~ABzgml+UEOUC9ffnULVXgdeF2a%H zO2|y9(wu`xFbh85*T=Gp)aUO*J!ly`6-mtt;K`!0+m6^>M!(x7^B~uI?>f3fbZ`mY z<;&Hjst=!-1(4WRamu}u3sMay2WCP43$THevq@N%s_2DgtL-s}qZG%J_i$B1C#KN} z)rm=DfL@$OPV7fsqxbN`&-V6i$Z8EwEE;|dZj1ms@dl;6G6~n1rwM%2MyMOBsUi}y^UCV0- zU~Ap*=B4~@hwLN26ZE~8-sd1{_7JH5P1XYv+}~t1JB1V&!exnG&_BIZocH?JzsixBwrs!E+4u# z^%8)b`mQbXx@gQ6!s&I`Q6IsT|1qP^bmHcj*Wl7Pa)p2`Jw6614?l>2K!qv1aQ#n3 zBJ3-XC*Bm!Hso|%g-X@?@a86VUM!_5mZ}ThR+MRIIXU^d_xYQ(+$z6RZM>v?&*5v3 zy5-kjZjeX=ckT_Q)y3zxhi5kKY}6f@2ZK`}m?d?@^q-loK^@cH9t+)`{-IbMNUI6v zDPppXekr?AGKd#|n6Nu2Tfc%;z14UOYRk||;GMdQL@8S%(8e&Uxm~CF>|XkCz6?DZ zgF}CG2I=`t`&n*csc|K{uQ7U7C0B2g9a3H!?|Fu{9*#Xnoq28&G@N=7%dCi?m3^A29v%Z1X8^mppt>zSDzRm ze8mYBmIr1wlDGyEw}r%SLy=URHYdCl!oRzy=XWVug1C#wZ*yfCK9ruRdjj?Y&;>w` z-6P%j2K-5XgoxL=+&5eob91E>3Jd&^!Y=o`HB;VH8&xDQO85HLc7>ll>%^`=(eO#wbnE5BI>9-hB|s z&2XpG`!YKs==;{;Ezy+Rnxp67#I)X>@W%G~XUaZT_Q7=qE}Ne+i{lxxxQuRRuFV5C zuf*zj@?P_`(F5biLudbE@MHm@{EDdC6qUdVd zWpIB42#adNIfBI2_6z8{CSw`3K}p&*yyI)qhC_rJPgZ{Q^;d`B@_QGA3gtwJ;d||| zyK~^s)jRR*-LW}3A5KdEQ}zvWy?e zPQhtqyfGfcuJL6_H)bfi?}5uN0k7Dw5i1X-mk03`Q6jnO^oxU!K*z1Auw==Wd?j2k zuS|nm6X5zZIA@PwIzJ7L+uxY)fiPU=&o{WS4UR071%M!Eb77;>J^{=I z(gu232)jHoL$yX+(|GJIG%p6Pj<2Ij1CpenOhq(T+GTezkw^zOCRfK&RFOi13#Zf> z-=%*z1!rYh-{Y^+EgR@H2OffRk3hO|C`4TEoz}QEPaVnFvx!xzKm8g$l?warMT(Tp znABNdcI3A}O(w7x`F~LsWl)RjQNxrWpXWstR^SA#0SXnT$DhQVmF(r{_ zap5&N@NB?Mz!1lzB_jNq?CsV zjL>fM>lM+8R^!zka|e+!x}rn?}LBllt{p)u@ik2r9$ zC%X3InZior;%*s$|*+y>F$rtEp68DcfqY`K+Bh{#T%D=Tjs&-h)-{YjQVnP 
zmLZXEN^Xe4J0pditJyV>U2o2U9l2$(g8DyJ5B80tZ#;kR;jNdi+poXcKL!qugA-FI z*k0UnHk?r%%#=rpZ0=NzFSi+0U&D-%>214CE>v9yRo6h-<>{K+b6JOx72!9$${VT4=xuCL}l8(w!0i3 znFT@YIzNtbO<{YJ-G4i2Pi@2wj_Uj~Xi)C%y0nsBw2Hd7=Fq(x_M}$5_s33+gTY6W z$6h1n{N6|C$oK2Y!!bO4#E$&)W8nE5NJ=k{Wy=G(&0%SsVf4D|li<+|NXS=(lZ(Q5 z`f#3l9jEN#G`RR+hMp~p7(hO6>^-$QIWvM% z?8A^dW2Qf_+m>e@g8aWUtq-053zT2WdWIHIbowe*ZYWLT!)SA4S>4iXt~`w&SN3II z)A3Po@G(d%X$?%xk4~#8z4Gd)J){#fegyCFC&>%fmov#N&#s3 zr4XPOZjR@wd!HfLhLicw7&tUz4|)eE!{2TDZqAPt&F()GN3$CZH=o1#pV5DQeYQ4U zpmo_+=7uTU@v0vh3?~54+?or<6&{j~grPSOXjON{Lz8nM@`bP(U6Lz;(lo)WQn-pe zTmZ4$8dp}83%iEac>!L~c6}r|y@AB3_fD#gWJ${|KC!--f}bCKHhFd$9DOr;Y!-Ap z9Jw+F25!xMpWC>aRiJw?bsT_#)+1|axk1d50G=$IZ-^7=z9_d2*dxr{c2c8-74edW z`WsWI=k3c}-@P|C=1?*}IJmOo55rROLR0pvWmg?~1uo42N{ML|C2uXSA&^{Ldl3aQ zx8HglhvgC_jlbbam*i@erPqh1$r{eTxeNd6($prrXcaTxdS~wN8&J0YUNp0uz$_y0 z8$9_|Nd9iP&570GfVDVs+MMi_OWXxr{<{VU{FWt2^}!kK%>I$GTcGGF*nMWa`VPoC z`EpBTr#qv{gDvw@NFOF67T^ zFTgg5G(4felhPQ$R}DQGg?f^-+qjmY3dNe<`@UjTxy3qTV`#hN+&r4j zL-)b$*Wk_s_%fp@m@4z9)}+V>A#k3a0%7z$0hpAKS|83at;UTPr@+&3z{{_U6Nz%q zp^}FUr*5z1G(@KAgLifie=zznYl+mXh8r(oKh&Ik;Y}?L%(8NOZ$UgBICFVZdim1C zA}4lNfN0%}v#%QRn@M4p7VX2BtQiU+u2 z_t`gC@6CX#&%w4FZOD$oH6qsSk^dKEQ5I!U z7Ui#5{xbf@HLK|bx9unGuYThE>9_v>^I&e~=MaE@PXVwnH#<8!KM!6!d%5+8ZU68$ zfB$#C`rE(%)vw>a{>`uc_V>U41DQe_o1A*=*tq@z$nVubbAJwo!9NT10kd&Hw;FyYC}27P_f0AXK`WbAzyfB z_IalFt>CmRW4A#IBp8}HjQXI|Vshgk{J|$6zWKrIzZg!iJM($8WP(Oh$&(j zrC(+%1f~hKk1l*Vc-{l!%8P(CWYT1d`aHveD9(>pe`h_;P@0e|KFBYwF*}Ti8>vy?g+8%%By)es+?@& z+C)W^aL_%)xKX4yJBe)i9lJ{csKx6GIu6XBbPN3K-5-DV;}?RWXi8ojIVUD1Yb{8O(_+z$WJ-0GoTYBZ;2Scxsi~dGR+J;ooC+uBok^~>~ z_QoV;z&;OWYu$b<{VLmd_yM^2d>Y@Y@=vYt;Y3Y*_#~AM3mX?_}ua*aus&FYsL}xdp4Ryw7$Bzu1xf0aqHpPz0tA5>w@e zAEM+Itb9d$N+A)eb-`7>EM4P8!}&TA zzxf?j%M#&F{{K!w3yClPwxR35Jj!ObmYEfpR2ePl@Z{-S@rLD;y=!vy8EsclO0Y*@ zb!u@evn*M6plb%@3F9ekYZG|?#w**LDmD!tz6$1QPTUQnS9$HM-6A#|do~9(ie0S! 
zh+P*&ZwlwMe^E1lDjG}yPMLDk4(66b)@M|q_RwW`ho$>({9pKA|8;TOSxCh*5ZI=N zXHa4b<73eB0@Xw~WIs-rZ{~Af=c`YnglE>E-^E>ecyO%gANl)b-q*0cRhBP+GCCu+cYNy;Xk{ z_GjyjF&|E)GbW9dw1hD0Io)T5p{m>+{a&&s`p49@G;TDRy_vf!?g#!Cc=`LxqUhA! z>RXd%0jM3g^geS>7*@G*m&JW&We8K6qDFq~hNn;0X6%aQ7XDDzv40-HKc>=eV)5Qj z5p7Nom=2u2y#UJlF2X6Zj$iNnLpp>UsMvUYNBH)%SZ38)k%DVId*BV|y*s*rmE+H; zauX{&^K|bq3;kI|^7F6Jhz~#9#N88;DC&KM*0Q9Mp4C``551bi(-OJOaFKGw+u+*d zF6tpjdR+xwE7&bgoE9g{;=(q2axK2XrU0SUpKtW%YC`x*h>HQ-#t@O-pKowysa=^G zcP@hepZx!Su9}X7XZFw0f2Y|G8TjTDxcCH|e+_!ZfpHXACsD?fixc4K0;p*@y&0?V zqRAup%F}Pa-FdKcmnk|`5yZBUFsm1(MtbHoT*3P;+*^}?t->;roB9UH&8zWa z$b5NCaAk-Un}_Y`{dN{LMpV^g7Hqe0Cb;7LDhW2QcdbaZNC$4kb67n*vuJ0CsL=3_l$oKv4<3&p||D z;X1Zy1KuB+Y>E@f0_YiyS04|fK>8Q!vNZ1`l_eNY!HahsI}?rNd2rRs@IjJbAeNT* zU;(_DnP+5_gs@e_^e!j9&Q)k|!JFJUtwgrj7gv;A8i7>YaP4*sCq06_$B$9w!&iBT zbk3QYcO?ykUHVV!k)OiQqls`%r8~JHhTj*oL$NV6|Ke-3R%2B?KCIfz;U{oXcV4+0 zNhx2S)#dViaV*~u#8UjRq!Uh}Im9efGy zjf3D6hT6z@#56c$Dr-n=o)tJ>a4O8D70amA<5XZll1{3a|=CVZL)9! zJ@4A=|04e{%AzdFqAbc^r~JkF z|LD2*?7{z|h}h@ne)9L9T(EBh%%XCC3-e%Z0oY3bK$)AKpPmBq^N;V|C2!wBa(4Oc zumAoZe)X&0{OWI9mM^E1Q*T^{kLCICH)At%)1&sIzHoL$5VOjk^B?$s3Dd9)Gr03D z-a=!LK>J0_U;w>hqgZM<`|SKA@;8U3yX@ri&1p)%q@vGD`&Q+&d(f2-6J{ zO}U11ljwYPTb^P)rz(t$i`PEL1JNzKecRQ(LUo{1e}#>)0c_VxWu$5T5&XOZ#a3+$Ga`Wohbf1kY{#G zR(SE+2sla@M!->wH!4Mj>wVc;FOJGx(CE&C?AqbY=p&#!%&oBDu(7BNxX4Q2NTru1 zZ%%@q3nS6&(g3_HlwQz$Z{hSLIBd^%U^#zl6J8TSuJ&Qq)LkCGGX+`>UtYy6^KC+tixAtqFmN9!2-2ATjj@;vYmkGXp`VTJ4P;!tvN07DhR)U09F zoU&WV+a=}kBC9*r>WZ~R@Xc3d(eBiaLPI1=>OoWc@l30@(z6rb#VptUYp;{O8d~H;6u`9E|nP+k( ztBX%e!&dBj@MI&e#3!|w$gbDEu!qPFoQwFDvyxF3!PgKlBO#-2xu|!kuHNl&v=~ZmAf+IbD-GvJ*Kn$nD%vjt&_4xMiYnqb`DJ}4@1SJna}kN6PsI(eyIg+E zjOFj`ynpWonCrVYwuV(2!dHK*J_e`E{^z6LN=nvK^CH;w{#=y@UY9X=_wfQkg=lsq z0jnhnPEmSHkax+a{f->yAy6oBD3;QNfK_0+Vr%#4*-)=FCT+Qi+T zVKvb^^44=|@(g@=Z_&C(E1eqsZ#k=d;nQtQb~B?U@-=hU3QE zhy1qMUnF(D8;Mss^JN=mSyON!nR3P;dN_xvhT~dr*&X-6xfy_!99+M%K0H|-j<*o; zR(pBUj<=;s?ebH#1(d^|i}H+ka*STM-ixF4#;lzEzXGc;+4I9|2X1coPV_0eGK^8~%c%@cmj%-EE4$C$nFdGhfK`b_p?pIa 
zUvca;T0nN@*;Zj@UsfxDZS%*~Ri{SaQ0~9}B7P^fin}Y(I?xVP?)p?5wvFe*xK&BE0~(Y60GR@SPWk}vp`pHOqJJ)V6$4Mf^A z{t1=diDgOk`{DdLI|*W##c^USF{Lt6Xb8h(gRfAqS(3PdkRbd@(ShpWO@fbw(%>I< zZ(tce5?g$>^FPfiZ@oJTkvpyBR1{ANfj`N5;T9y@%lE?ZVq&^=g=E-;+aHIOTppVl zot!0$_Qr@z&fI2au974%x-j(~Sd%ZWi9oF`I`{MloYc1;uHQsl?C`qDcis7)JbrNd9r(6cYx5i%B-m|aA-xpSf zvMQoE6|2RyA*{Wkb`uJ5EI;<%VJ9;oK^JO2O9R){|3c7XPWZ9p(Q^GyEt1Z*}Ciz)u$We^C}?Q5I!U z{u<;jbmc$`)94}TAZGUdo~&|W8$xnA6eFnQ zQ+$p&gi#TdlDnB(5}mwjExRHhu^^Vy5Y1PcMuBAptSdX>MA!OdbUR@dC#=akV_+Gp z*^$3*3B&5bHU|pL{>+9*T*_;@FgS&(oAr!=bz(&@xp;$E;rU(q=ec@@`f%;=3kcn| zXMkzF7{OPO=rU+3yJ1F@`il2+<+A^Z{NKv|cm8jB%m40NvujGZ@gDMkZ8-646;>U= z(>XItOVf_LBR)(JcZIWM$Dc0TdINsQ)5o%9zEt@a%8bUd5@8!ekxK(2-+efi93H_ZIcN#~JYEx1SjU_a{JdPV;7dmoGlx zC+rKN)*PMy*B8K#yK6&mnGaiCbKV}Z{%ne!U-fBPeGsK2f>rfRRbTl95W-Q12rce( zt(O2DKmJ@P0iCqiqZ=%5Io}bbVzRii`v|N9E}ULBPCNWB__ny!V^fj z7skTqMIMyGP*xdPvH#ROs*nep_Wkbacq%k!)o!%vs_XMN=0U^0^WMn?-aMr@$54EH z?EVb6_~5y)q4i^7?P~g-KMKnwHL6pOUf+ixQNPcLT<4e7=}N8pAhYqv?c0wg#}7W7 zgw{OYbR3d@7*-jIReiYIWVnlVn9BOEZr!c;ki9p2n`jfiJfriX@(v)9Wx*`nI-cUl zD{yNHq!egkCAN2x^(cKeLw#ln?T`FW)D**4`|z7xD5|w${n;6C=MC&wWvo~hm!oOB zkKEn+uTFj?stKi+KpcI4M@D7S8R_|H4|*d()Z>L2hMuDUjU8F~D2g;7qiqGx5+LrZ zzKnLulxHtTGxi3v8v@cSHFxY~0?$ABLRkG#rYaPZFHfvpfmz?hnwN819eB;|JiRwt zx>eqLa1NcQf3v$flD5Y;b&oG=_m(n6@nuv14@%|1SDSHFWMcVBR&Di3dx?iT;Oh+e zs_nbK66y2IXHJjIUYrFv?dLaVT2~31BgwT;Ew9gmqLzcJsf8OdoBimudrsYge`vb) z>Qi3HhLp6F!p1`D&;@%XwmpM)Ll`xFEG3C;@Dj8Vg`Mxx4DV6&K@v;XB+{rn^NEd= zl9lYrP`n~kQcs{3`zMrd66kwI(E@?BA0{R+zbmLeIgb2=O8d{QftpO-yH3)Os%*bF zGj(lh?EE?-RB`WEx~u|HFxQy#MKZhXJHEg(Vd%jqU^I(%_W z<8>7B&~@cyRFW`;o1ff1+y@8Z>a=e{fiJ5eNNn@wcKFd%7wrWFe#qRnp3)MRV(5Ab zj?d2O?_S$b*yztK4`gJ%pI#z8b{F>N;XB}So{W@Q;Vsg9C_mH#*QjYg-X-_{Mzkr{ z=$j_jPayI%A3F9SGizm1)+d~b#(t;^2x5OIXkE$Et&y0&E9rv2bMV>1mj!0ns8v}_ z&Iu)}b1cn|z>RUhDzwH+RFPTQ#!GKddAu7lJ4$TfTMGie&)6WyFFkPL>=Zcm8f+2O z`!g#&C8pH28;{U5f96TfcM~`LD|66=yMKL}?BW!7ISXjJ8lr^OcPOeA%my!!j=(kh z@LPj~t$vui?82*4P(w~%-7Lrp-ku&sDI@JHA(5;7g>tV9rCXZv-?RshPJ#W;fET0E 
zU(z1P?vKJcK1tIaehw~-fz8#O9Eer+yap$q0dk&s6-O2%)~~=-{^_Qw<0DYh z+U|mtykZ}`aPO%*cL2Efa*AB8i>K^bMa|uam+*{7q~|9h*mZ#%l~Za1QDAcuv_kO` zx4SZRTPlyVj-pf-yed;{a&9QZa+aLH*81>t5gaKdZ8>jGgppj?;gwt-!mK^{#$I;x zF-Vp6ucp*T?JWB&Q)#&L5}G3?o`4Okl1RqhAZBj&C>q4!o8#nCYaFZGGdX{AfqBpV z$CvE2BtOqTmWW%KK!2q3lr$g{D0*CS@b$TF}5)M2F$&6 z{GVO0%WD9RFMueqib$&pO}DsVN&?^Nz&1JYnw^F1D{u=zV)Njewzk}5^gZ{+8e_!T zXsr5uNu#j+@>gY@LG;=vq0WmY`$l~UKR6SR*0GeL_7-zM1zrerCe>?vl5&w7oci^`}@f6$qIBMe~R4cA<|2@B?5{Q=`X#@Wv zKJho~1Me_wQ3As$d*<*Ti)=orwLZAjn`tC#PWRi(m3K`5^UXQ!)yYA7wWNJ70j<8v zKe;%9qY7dvhUO68ewC$xMs{HK!9TF}`AGJioJEJpKkVuVPS!c3*hswYV744~2jsVJ z3N$`U8G)|(prB{p3lzY7)1E}3@eay%)^E=u7M0QJ%WHCFnP9&!W7?d#J@(QDJ#XtJ zxN{JDxbQ7Y#pb1=Ru_EG4Lj_}9kAyO9|7Hk z(L=9Lwr$u;bw}^{Qp%mV&E5sa-lM34acPll|5JMciSFlMOSYbryw`_S8zNT2*PZI) z2kr69m+eNR_VN_u=feQh+yF5dZSF}8K~(9$2+EB3Ri?rR)42(T%4h`5V=jpG%-aiEf=D`7bBp( zqidqbiWsahvGK^haTJ6<@B)}GqZDFC$5By9*ijY7o`*8({b{OzX?bVHgAWr65yjXqa3rF7nb_oliw<0IBCWolgL1~;0%hAA+{ms1@1^AT+54 zw%w0WCA$eu&4P;y0K$^y=4AV0RGy>t-WXK8fzemtskDFu0Zl#BVc$yV8(-*nGCDM3 zzw;X}U{97EeL&c;E0QCNWYre--n?PgkoJeb_7c3GYx7Gj`%`gCuRVeJmDix}I@*pI znnm|BI0`bXhl7*S!h}UcW*%%Bob2Ca?+^b@z6R?IptcbIjQ{`KPw6fHLy~vr!s%v# zzzKzjH+f=RUd%pERv!`TBVc{*>^?WV*M$S6=cgV8@qdQ+i9(sfl2R;Jpk_uKc zDnn`20UVvbzyc{QfZOPsoU6Gs3CXZ=c-EVt3*;L*N05?yUo_~Kqz_3u=*jJLr8Mk4 zIS$*>c?ad=4B}RN+jOE8>it|5Pr1&(xo6s_EwRksw=e5N>a9e#LshDa}6 z&NaJcoL-V@i{jLso<#u>e-g>O=xR8%A)5MTnq8QB7pl>XqVZ$MOHaIjx_534oEZaW zUV<~PP?FoTvp{to1TpJY(5t^Oo`!=yI01Zl3P+~FHPsfxYzw7S9hpW;L2}iQH@hJs zLtfBza;KJ+z{Cu6Lh(OMBrj;c(pMhUzyz(%LU**qjAW$oZ z?Ao|vkBzXS$djva8y!EMrchx-w z;?s?O9IZQM2roGFF5T$FX<1IwE=`tk2FIc19C!`3aLU4H zFQ9mzSAn=@rNlJ&7+kOy)I2f+4vc`#`=I|7y0PO=0JqZSyEQWgJ|OTtwh=V1 zZ;ha>A~+(35pd8xbN7z|q47#6t=5HB*csK1BoW} zVXM7(W+K<>%4xQj>g#a)`8e(8p8P+JhyP~%KWK=1;bu?#7yj>9ff<7&R`q>k%@iLS zjS-nVFw0V2?~>fhze_*uBpO(~ySwA|-0%qaw!A+88T4JyXbz;;r>IWqpP))qm+W<( z22sWA$xrgEzA44tbY-BR*@vmsJ^&}?!S>3&0Nm)pZvB_kCjYE~j@O8J{#3L-40b)M zBQR?Seueb~=p6&!lyyPMbxKkDGqj)PcWTZ~oSH}09<&spNmqLt>@eN(V5pWRHgy)&x<(QZ8Ey25O 
zHCMseaiAX_jZHVW8Iz8`!k|wmc z&{eTQ{YG9*QQuWa?{Cw{LGu*a=wi!o6sO*krSi-<{0DBYqoCVIVE!;m0WqfM8OZFq z3$=M-OKpjIyw?^^TZT5 znqUyTx7~0h)lu(&i&1zJXZ&!I_J6?H3yrO$kD3ft6+vKWbwsIL+t+u(&U=Y@M5Uhy zgfE5FzmH|t)R3JDo=hqpLQoUp?E+UUMI90?l#>a7bO6VK2_ceIlu>fM@cb2Wq!7ih zJhE`K^)7GJ$qpUG^K&!2ptF|@USRHV@lAU!`~BU>NGGXn`jGhi5!JQXVZH~cudy|6 za81wbXM@f3W_oYVNyP!VDpZgp1# zDL3VVjzmH7YxK?gGmq!VPq*NoKTd6>t&<*v9VI_%cx0dP6bs8ANpJs1s;p}cT>j~9 z+Za)Geed%7(p4?XT7ULMbCm`7Z%B3j%5JWMSU=wlc;ayM$Y?Rse&u_~Dyqoh-zskX ztvX9(_zX={5fzlOp`mKpx|n2pZKNEZju{#^`S>F!l1sZpat-4+<%cgd`*w;RK`p05 zdsMMDu88T8-q2!Z`gM+q+bx#wd5dL8gttCzCNcajCdlk6)O%v_`J+Eu_7}&q&s7SB zhb_N@BMPvIqfYVZ+PTX6IA^l&q^bq1F$qps>6B zX`7fqJLc>b4*$(>>JoKMNo>hOp{ccac72|)09LZ6ycjMwQMcGi^Qube-= zF0Ssp2_|ULQFj(2r#A;Kn2sre_Qyje?Zrhf=p!y>KH(j|uW1l~uRVkcHfISuOu92z zh_9Y6MAx*1^@vXKE2_{Q4aX(B!DhS;Hyzycu=~l=L!3G;3_a8B~GHT^iPFSIU5F3oGIiYs9AS%FuEKC3G0y*uRqNmS1o^f zsd3M4W5i4|u`ewB5LWG@@N;}zi_oF-@YV~N;3|tFsh-~we!(4`k{v9x{1DOf%{)(2 zw4B+lKz{URH_I{(Dbh@Py}y%vF1x z#TJ@d`>s!VwtGNvW}i90<8qLS;tmor^!oKaK=ae;;aUppk^6ynW8p>WG2F!rFZl|j z=2TBw(1tzF<$yL4zA4ukC0Hq;w6(%Cvu|Ycmoxm26VzyBJ{*YzB;--<7ObZG!h+mH z<6akZ*`$px{FZ!_d&Ms&fIlK);UU&^fpl&vN zvYrM>rjPi#Gz9*(nRI^WF77Pwlc_*#_zOyM zz-i5N{nL}y)=8$ptX+C&@Nkg9F!!m3XJRlO(jHxaWIb?qGIG*qqw=0P)x2fIefy&| z0?qsbncy4I7TVP&&v+n*&^qKK@@xWJ;kI{&!0cem9yRbeKo@PJalP*NnrnsL);5ZNWvG@w3PttdH1lsx?kq zdw9aVZijLXEh{)ZNpB1){B>wwWS8UH{?+I6#03pBI!SLgQQPe1EOCa9ov~hoR+M0s zI&3XPD&o2F&*sI9MRC2e90oCj`VJ`;_xy*ROowe(j_jY$uJA^tK68rU_9?|9;P`fZ zaGZ{>@+J5^WBFxC1KfyW z)_o>wufJ|lBYOcpfyr<7x%f2bM{liZ+`c(aLn^V>^yGydf*`b<0n|A`9auwFQN=UC z*fDRU2ru)6PeE{IFR0%Mbn~pWqe&J2Oco%b%e;abQLV5AX3vD@%43iF7)lIl=YvaJ zb4x(MDgZ8}LNB-i)WM#TBI2q|TsA?lzmJ#+pk-oS5z3tP=aJ{Eq&Q@* z6i{aj22ziz9deN# zwHAu=V5;&x^76HFMH^~73Mu?hy&5Wq(LduxI_m?UAgOgCRjY}mdopG7s7|;dlj2E^ zpwMa{-9Q>i^dwPvg>=3mLq$9Zw#WQSqG{bcjS|52njh~;dI@E?@ZBz^6VD0XiUjTf z$r-lmjklYLn!EsR3pEwGObLrxve8v_E2H>!SB~cuShp-5T`wH=271h@TR4(d2De&3 zdP|iDPBr`3OW<)6@|$$0EkIlvyPplQ1T_U$%$7g54JyE#%67yD-3RFRao_H^jm{wR>!Ax*-F{I2 
zsfwT@!Xie$ntS!Bc8T*%@1iU3#9dCQpzz_7B?Q!;TMX3&3loWL)=f>99#gyaVbYV= zKm@uqe3{^aOK8c8hh%B9tjXdsTAhdQgLleqddpPgl)es+nBo! z7Ag@e*TH`!DHnJ~EQXsuAh>=gChwEK{DXO2OE{a#M{RlCnvb8X&DeA=!sSg2`sK%J7zW^;$( z0&zP{&@l0((sF$3;MkCL%3~_nOjChpF|{^94!5*JWBK?F^P`iAIU=IxvEWl(C{>z3 z*rZm9(sBq;PkqpRAVZ<1BT*ZTEGEN~lAXk&&^YMC>l z>%_Sp^1@3W4HfJ5l|}SP=83#U*RwVUa7|+cu8_v88~odkCCIdKw9j&#-K1%s4a-m0 z)7qoEbSg{6ui#u4sz(ibAI3d{Jol?4>nrToEz59X1mIcE5kX?I#wjeUFV`nNfc*5x6iZH<@@Wl`~6p4i72D zM+Ley^4E?h5V{@}tD3S5n6Q4ZmY8r1o%WbH%dl`Vq4;<`Xi6yIOd)j7z1bW4NVM3` zcbrwX%A^lE*Ps3^PnFSHIYN5rYvO=YchDW*6}JUunyH7ItJOG1g`3dL2tbG`>3IMN zQWL%vHx8}>Ev z|FhJv-w?idW}fsahJhsH!<7j(FIOg5fwZj3iDLDVfXvM+k|45rTBH% z^{M#DX5+4~erJmR8b0h}crRO+5qT6PR-sTDw#>?Vyja9=r>-H4#}Z|MR8A(D2B1X& zY#er(dw>q85ByT!JfEMKUu-J?qYsF;9lAZ=c-iht!mDuL<}F`4(Rb7yK>N%B$E*UW zi#=Qle{d?uE_GjxT#rwx>A%qZmv?5yYSQ44##@xSmK*bvB)$hT1#ANDd z({H_;=_FFjf0Ut&6EPA%dO zRNU-qg3Pvs_p0U=4@BD9o0eq68gDi)FbteeoH9w7iqS z_e;hNE9rgtWvjpD`TNYFd;qjb@PcqApRfhgzYPL+_%E-PrJvm}@)gki8OfUu za3(sv^$3`(f~5&TgiqU~;~o$$LIi4UR2$Wz)G($fL(``paq=NA4{a5#V^dQ!KNe0# zFruJj{R^Zm-C7r&fmrFdJNKnBC(beZf1KmJ=1NS5bORc>96wPo5p5qPxxWi9IT!#E zT9j6WP1!@h*`v(G3kn;JUU|{NrWi$RL19q{Ntx?Q)-7hK;%w4r>xVtf#JDq`wCW*< z&0lR)+$>jt^|ue_0n_+(dq__flkK7>P;i@@1ggT0z(5WjeREJ+<_ge?#RN)N88LIN zKvFUYNnC;T!_3XIFc$z2WLeWdP=i4Guuq;wZh_05v!fOs;sA$)zCpN@n!AFMV4_wq z+armbJnTUSgvY1p4P|gxaY&ofi(-N{zMpt^eAv?VEQM_9dVop(SusfeZ0j%wO= zXk1NXz%Bcckl^ATq(=uq4zey^x`dw4KTV{*9Zt4GZP+xyQlzH`k_0%)8iB{lO^QL$ zv!}T<3Lt_}9ND3)Tvch$?jT_6&ktYwp3QZg&R1%;xdc~ysH|EpEYt*aD3s0~_P=J=^zM_%ozh#j&BOZ~ZN5vh^uKzJ!S%6l;uSSiGq5EJn73}|Q8 z@t{|&gc+vroT$d z+&bFWEcS%fO5%^Qgh>8-miQ-pJzm3c5VHc|r*&J>n=34%P}@o&lfOn#6~3*um8FE( zeD;xi#3w#XD?7@GP|5lxH7)>rpk!1lRm2|vMD1e6p^71?z?M0dzP}ul6%_tZu&#fk zL#%z;ix=DB)8h8ERa`6!m+P!Q9Qgo}JUX{5F7qXj!Z=Ad2*vjz(3bug=lMOgrji>G z_MS?jqWxz$wYrRzpo!fDX#d`rdwniclbbt{S~}0yRjYq?hWYie&Sxp5^hNCJc*zyY zi^)DjAIf(kvDwO}b1F&w%q=aSm8jjIUp>x?9{*zHtp|6aakP&IH^LL&GKNV*t=-i(>Fr@F43{yWhDcb~%@>bv zU3OS{DwNPLBT4>lDITrd+wt~sb8(*`b~(lRc5@+d>H#N*pKIplTM6aP`WzZdz=K0e 
zeTUw*-381GEobz4O_`n0uqEfQCs5iE_w8iW(#DHvV52QcZmVF_H1h|rO2^iE#Vk!}yBtFcwD4?8G5;xJ`-@H%3zQ1LRo8HXJyT;#y+vJrp23%bcdncdpe1OsZp zB1eWw)grBWjJA9riZAtYSfmXi{fzFM6%2$hx{&sNv?qUTeEG0YG%>1Pz$L7x9US*1 zG}syBx8jk=@mEqRn?=UA5^b{}5skC(k86d4f*_m_);oX?%IgfY;#MxN?}GX03r_YF zUw9&ekwW#}f@z?-zbPi~?wi1MIk4zLATbJS)M=jP=I>$d-+4L0nEzFi{@D!vA=kLt zD*Yv;{aK6EY>_`6UsxVBwu;~Qz@OaxtfV88eHf_M$snkXo2-pW!$@)y!M4w(w%_bS z#f@0}$t=9>=XzY9?oGgyeGT$rVq-HvreXf8>-du|iSCG{T+!D)>$=0vUOVECRaHW= z5(o&YvVHb+uR?PG>Z5!@ATdkTDDB-8EmXqs?tG&e_!#pM7IQB}k7@V^LGXuC+m)3FE|8u{Ib z17$GCoZWFg4|XlfE?K261-)>o^fyx?;cI0e%0uri-WVi{5YYGjr~lL_`NBL7G1|vT z_f^TDoV5}bbZuY}mwT13zTA$~zKZVEw;%g*f1(G+NTi2mGS+^x&RK<*_$Vv@2+l&4AK?;DHhgVzxGJesNyvpCV{6N+8CFajswyNR_eFRd5r$U@w>i zwO}?p(WTWD&w-f$POP{q9a4&~n>kcMMbr3M9Ga>>yGaFQ7K>p~4qsNk*E{)y6;Vj4 zpp}zVApw02ixemNwT!ASCsdoEe4{nLM$2;u5?)9dB0y(WXMYqQ1f;r^5G%;z3P9IT ziRMZzvhNfUk+{fz=G3mrYj{@=LNtGxZii{Dq*Ymp{9m)j=NJgfHxqJln3Ric3-wh7t=FTIPiFcgM%vxOx-~ zP$Gral}5mWm2Rk=sf0sDMk{ zsErQ@kA_8}tWc{errnlmE1tNj_Hp#tjYAyfpAfssui;&LDxO-)s%r6XZhn2L?2ux7 z#Nuv~%*0ZEf|#T2)3fQ)pIq17uj?v!?;`7OEQk1-POcFV{-VjVgcAZw_8mwRvkBOUcpMsv%C{1x3nDRtwy zu*nwHz*f`t>-}sj;iSFEpiw2rhK1ZBDtNSfC$;p)OpkoaWv6v8pe~a5&ccVP_G^oy zF@Z7H*)XV;rp z;^Wt55izkdCtYuQYvEk7cAsOoPy01a z5<<<{RPuRUas_R2`cHw(N6J-mt85q36$`2&;TM3W77({AzuE&q6UkDc_{--bUcVua zKU(N@7YWQG0+sVf#q1t@U4%hbn&gWW$SOu=XW3LCBi26yO0$d-xWb-5z+X+j_eADFYo5kalTDEjmN5@sbPt*)hoT-LW`;t3UMlXrnmF~J9Xf;ge zT7lfatAOM?Bv(^K>!J^HjgB;sSe*67U&9yLh}a1EUFT)lSZG8fju zv-mbpW%eIP!BvR)DX~j3=s~F3K|I*q1EL_8d-d>GpOHvUZ{IbuPQwHm5ieHE62g-e zu39`G+@|O+fUU3u2TJ3>9vB(d zzfcB#Kdh{Q*_RMv79EymD-8a6b z!iWdr6;wj$54b1ilF6Unqx*fjR-thS?qAB6AcN7hdxgNjrk6vzoj@mHCM zk4~rGn#E+66Ck7p1gpvu`EwX+>9at3MlJUza0rAzj9OeFFZ0LhN`0MG&3-xAcBj>s zf5IbjZ99W)GLDEhFXM>ccfrv9zS4e3dny*jR_rGUbpYz)R0=N^~k(L>qZ^)<_U0hzA#*g z^H(ydo@#+!Wyqlqj*d&XKLF~SKZz`(dmA(lAyt+t_fb5r$f!XR(nvyUDP;l0Q0M}KC!n=shV?AzFEOH%1s0}c zckjl=IkOSs{u$@YvZ9m%%Ev+^gA1jrZ$_0WjIk}0M3zpKFFqQ5awxD`1x&)H&AJ}% zJ_sUS$YK`K2x+-5tp^&$IM&vX?OnGPzWF>a$PaHdRATAHvC=9Dz+M&>zD?Dh8Rm7m 
z`q&2)j{QK>n|7g-0HGPV&KROf$%VzwXLIf9d|m|wd^%UUEzJ|Us*jS~Mc*OMFM@gq zD$79kL$gjo+9GIXm)daYPnpg5(Gn58PTwj4;L_{If}R)QE*3o0?au*7J= z-w12$&}=Ic3{8xp_5sA4wUS1M#pw*-jBNVH4=&$=`$ zVB&v-PbBWk+)4G3q@tgBd~ZT=9@@%C8!@0X+K_0uaHwfQ(a*l)gxuQ1J0~djn1i$I zh@rBxW9y*{D)G4y z04A_8ERg)G3MAiZLA(1;Ev0ohGtW@shCzZ@sxiqD+Q*dD?DbLcVXzA8mg;d`Lk-Fr zJGV=Wn;Xa(hNx;*X-{Uxui;Z_@BA>QI&cXbeFw4PVT9hsO7<2~lBNh^djCTRQqaHj znx0^_0D3j}2j7HlXc-=XgQz%;-uwo_h`cgkv4DKo&$ZGXQ)g9{pnG(#-Uirc!$%Fq zpC@^Ug@x9SANqwp$>j$&#>w6+SvKwlLsuxMCGEG@d>0I^aT=odf8%h`MjB0#jnbN? zLNMM|R{m7b34gxm)pui3l?O&lB*ce8OBsM2y2SugHIB&X%cE1@4-UmsDD8mu<*PU# z0m3CP@hSo8lJc5DZUtqvk0MvaREEi#Atfm$QPTAIlk^QdMn$dgMadt!-vhhsmpJWj zG=sT}(l{hF(JAOds2%{0@B(SjN}7Evl?cuka>A8AE+C$y#AyON;XO*}k!m5q@=rJL zzvtERK27{{yDWZx+SVA{0R{@QvV&VF%QsST4kgkS%Z%IG)%_tD&&C~qx(Mf%YM;i+ zbMTYRw6t8p`YPJZih7(ok-mG?W!nPRdvy8veCq4+C2NUcnd3}umZ9)osse-wgw_(^ z`d78fV)uMy?>_Vq%<>cb@@-$Gh6-3)Zt{X(+D9S=O4=yy)+v-QRx2Eyp%3DnK8r$2 z9f^V=2-e6y@h#+hjats}d8HD=71GN(Xt8I9#s8}0F+f-rc5-i{)B6N=EuvQO%I(3$ z$F*Xh@D8%Mb8N`NyJXS4@J1ZPMlaTJE4z}EX+kxVzH{$OKQ3smOI!^OTIo}`v|esL z*5vF+$Uz5eqIw_Vw3y6!6+QDu@c>j%o2}@I82OxtslE;eE|xfQO`;Z>9SyPCGfHim zR*W9A%Z_k7<`H&v%GSXpn8X<&7bHc2pFxO+?HXF z%W_5!0HX$Zf1z5cci}-hYj3wkS_@B=+QFWLRgiu$Ayj;bgV>x0y&4)}9ee`rL6!G| z&h@+)F1`okv;dS>`M*BylZ2J4hy<>ibgo-PB`a|%tPA>4ifcu}{^+k5Q@Srg3aq4=4%fc7+Ruufh9>VJ8|Cme!~3khhaI;l@2sV# z3n{!j%-FfU$Fd`y%-zS~!NlUW(Jj?WYK_cb2pZ+@aZ73njq)_HYZXM!t|r-5gw1%N zP;j4;J6HJ?HI3rZHoV%>!J?mjGx{F*awfl?dzL+m@;%E@mub_GA|Lgl%2vwb4+0`% z(gaZ|zhaDU)jFCKwWbhPh>W6qhZ*he{#BtI_r)JBdjX@>y9Cdp)K+=1h}Q7m)pQo7 zo)gZ37THrQTa>SoY+=XhD%LfjU-^vESVYWRf5#R+Wwre&X|WB0dbCIR#ph3}aS;`q zI#?#F$^qh*;P0>|861fFMV{4^efA}kZ&b3B{;50dYh7#0r1Jhj9e6S->xH-E7%y$_8y%}MF>u`PT`mL;TO>R z&Sm8tqRDo4GN!X?j-s6lHmKV2@MIac`NYLzK2Gc3<%tY*37Nbigcu~unq(zjtHjwTd`(eW);}9* z{+m2-gg@_d>zxFJS1P5 zIE#p*WypcZ7Rzmp6Q;K;T^yzls%l%%a&?hRYFo?YX;t-3Q^WFPZO+zCj0UbgAxIL|B4qJ+MOe85_3-85 zqFFNjEH2+Vx~uGFJr3Gdh$A%{xK&c{VnTu4;E(0ucN5bP$%`H)fQ^plzXeA99UI}* zH(Cto#UZLxp8!a%n}51mo&htAi_91>x%owixS!Qe``?F%L{%}ey 
zWNeko-5?qxTYIS*w=CaZ7_x%DSnZq<%oD*|K8y}n!CxhtrdS;Pq4{&oi-4QUdS})! zjyGx9R81zj8IN68LAzcch8&8^JJP#7x|U!!+MDY25Ou;O7bEyyLaIsRy0PJ9@jr{h zU2zDZ(bD8}lmmT~Dt1grw_d6zoh<^K2_cQBeztruq_>bmc{8`X8^bk=u>yxnnQn3P z$|&CEhGT$%0@{H>>Zs95SQWLOjt42GU4+0(Z@ZY`PdNC_9le8@*K zNce%UQSqVTYGze+(L??43g(Y`)$s)0w8`j}Nv+qb?2vP0!j}4`u>#k~7RlpgqOy)D zQ3}dt#OPhUoQj?cKX8ZqLHb|@WzB;r094|!PtDc7mHijI! ztjoy45ab_wBEZv01Wnr(;LPa!WBluBu&9KifT88U%74}StLH`5EhizxXD!z8B+BOV zi$(Ll8te_U@jP;9LVXBTO5o^VlXT{Y6rfyn-@Zwuj9=mwY|;{ebQQAbNEdysJ&oPM zO850+(RehhU0D~EMh4Vddq>DafY2DMai!G&%)DK1+CG2$dh;aaQhW%o0TR{_9% z_@KrA9Xh?{#moJ?LpgTJvWN9(<#U>f=nyYlVF{NjsT>*awANZQp zF$86Q(N=RE9~iu}tz3+kl^}@gHp3^~$*D|z%^rKi_1)8Zv1mzwA>+PT1~!onE>LeG;M*zc?rJ$sVXIaVw% zhN2>32UT<*We^fBaP=QTyoi*Ogyv%z;QX>-^8~e?&`oR-Jj+MaBUQ@(0BQoH44X zBXkkzX2{22Mq@{ULRRd;j{Zn!mnP3J~C?2iS>r)8Twev7rBodA0 zTlrL(QeNSC*Z(P}M?abVDEOGE5M)=?HouiYymsX5U zTB;^SD^K(5$6Wr+9+j3iwS63|9=bo;w!RALG1`dL&!Cu_PX>;y}|-vrLRd=>>Fpji|L#SjLqYp6Kx{un1C zM17K83qV){y^sf5F^j_299QKNVEn*e`dz5#MZDZ$Ub)hlJD|GSnd`PN({5p=i@0*o zqftP}sFdA{lwHkFtVzkVNzO!_gng-mp@zpqa$X_lnnhB%i3l z4LP<>#Rj=8k9mzg9b5!z)2&1X_1U;#NJ-KzYThnlI=1+=>>Yn2avQeK$K*SaNm}0+ zNY(daJ*-{;zV+WXOhH^rHk=OSz*S!b7S1AS0w1&bjfeBM5c69A zNgFETH9t_{d>brVrjN;+#CRRK)s;XmICHnf{gzu}#dxzli_U)kPjz75+02QJ(Oy`G zy$jgocyg~>-fR&%6g0|XzFk7Xl*N1P1&#u4PIJh{=KCz%`+X!* z=ax+NR(qK0d~kS%mOqQK*&g(Gx)l%G5(Z^$@cyabGubX>eVpyX&4qwqlI{K-@E%zM zb&%G-bI<*)!!7As^SX3JN`+pzs#ls+ch_((VMQDM6bowS6?N?AE2B!Tf{#-tI(RfI zV3OQ{Oj>OrnmfRLc|AV{XpY}@_`C%Sia@Fz0RtTFdo)_3aX_1ez*w6t&D!<8VSrm2 z=@KU4%fnHv`^cWLCe_U+C*z=8R&hPj7snyM|2eU)(Kac38kp91pvN$&+!Ba;)qt-l z20^_C6yMI@tZ^MQprA-AWDTvP!JpVf#FS9e#{Q9-alV+dc=2(@t@!IZ!gPj zo34DmSj$^_e?eyW8<rbLLi*P3K!$OidYEoH71 ` zL+kxg`1t#{dCMJ|T5bHTS2@o~7@z~^(*?u)6wJZ8fgyG|w93l3RF%I`lL7+)_4Yxs zv}5xAX^iP|Ct#x27l>ZKpl*=gsG7sf=4NR~Y?f8xez;Lcl2-9JF*I5~invya4~dRy z)4LGD`C@G&4>b(j3dK7~)vdbrR@~0ohfddMHm-7GO$tBAXx4PHRnsU|78!;hbE1r$ z^S6fi$&0PdtCKzk{Rkb-JTYp-?-6a4;q%%|o~s zotS8N&OPcx`a&@XpF+k}@4ZtKi!_Q?C(WQxvIa@r2hE<+mt+D+VIX?@;q=(Y#1Zx5 
zrx8hOYwCr=rSHg71&tC3T4>Z=WMZda`G3$6I>i2B6+~(omY~~rq(SnnA4UN%4?w|g z`w6*nV|U?S<-EE`eY=T^5@$OyNTp`U{SGA^<17e*#dit@p!Ayg|Iif5Aml3hTfz0k!fW&@?+BKz1%0 zzNr?&+e)rQzGKT zOFb&X=+MUp!#H3_yn@Pjl+^#wegKGr7XGtQh^vbKZB*4&1-2bg0@f()H*=!9#}P;} ztqd5#xEurmwa-fg75Z%FHri-QtOI98DBfQLRnc+aDHGdDsI|@XuPdkjyH#^<6td#S z*)xtN3q`B!dst8{v^2*b7)hS=nXbc061@ha!UBlXZ^-VUlbiAurb6+jr02HVF5LhX)0^4iCO*35#= zC>kISK*S0|CfHH*dJ#JKK{^ z*DBV(MVaJ;V16o%M0t{3fulZ{edhp0MKO(_{ZDwnEs4O2L82|8%$H0fof*rtmw|={ zkk-xP#JYEr-fqmbbMk*KdOHL9i#MZ&dMXiB9vEBnpwNceUBq08rrb8VVyIdRd88zT zH~h~eo^j8MoOLxy{Bd{i<2Og8$1$B*A?Dw=lQ#|jaqE4d>E;GuXy&D|$hgse$+pa){`yZziuk_n!EUDZRkDW*m1TnM1V*_7lWP%z+1wDyJ7zp}JEOF|YVN=sW2$OSy)w$NhH;TeC$oJO> z8XUPdYVWBXSN)Z#j8(BUZ6by}}S?T6?2;Etcp{167+-r%`;qG4^0vVu0flrFX9RCFs z(P&oBRz#M>vKuanx?xYT>mRc(+H}^IIu5JasC8Y^I#1hIIXf@FgJm1st>+Nb?S!n9 zyOh|lP3eN_y_fbn&~Sq3$0mQvHT-tR@)YYj!Fs;cx-z}c_BB6yufOOuHEos6JbA7^ z=pCXt;x*^}k~nj}{u*1EjH|%4L%G1M#l34hE4<)A{hF=S@!53v`K-dUv-B-N)?hAP z8_PE;N<&E#+TBx{9)9FN4_<51d_F!~viI7NvCgT~H`&H}-&xLQg|qEY>pnd`Yw)ZV z$D@@Y4n}_wtlyU(TVPxQW0l7Fb_6E1ZvV_h_16 zuB?N?MyFTq`n^4cb)R>3dF|3IL*Rdr$IRw3Z7Z)q18cK@ci`gX zYxBW-Xwg)M@9TKD?P`?onnS}!aSbjCXFAO2N8}@3byn8JuM9Y}_3T|jld!wILl?*w zfjvw+A-Cy=ZTJWujjiM&Q>T1KyJs)ot(D8o#>&aXsLI93s5#TA3{KC(wM8b~CT8D_ z^;c`)Sqalc(b|;HlFMTolQo;IT$A=7$CrDiO@=8DXb9Lr-AVcZ^m^GyL34mHJ+#?h zW727?)3W=d9^b^e%6LvnHnwz@aW!=uH3cPkEiHL9EoV+rORh85=LAjvE70169M?FYCH`iZ{4ev&5JKM;**f;J~9Ff65tv8)61?Su! 
zg%>y6qWVD@;xJON2=|YQdK1p z%IeD6o}{_VVjCOlOjoD0&G>vTso%BP;UCXi9@Cnx;g)lIVyXz2ke-plUPK&yIN(0|k3i=WqO=VrvEzi%bPL9p2`IhDvH=PPQ4#tD)tdCZK zQ(3mWTR*K@r&pXh+inEe*jFcMsD05)Z2vyVzDrGKT0Vi-=K|RO-CgiDt^3GiUi`?k z&N5u%5M1>Br1I3DuL2kReWc#_b{(OlJ|(-Y739hdZ@aQ-`EU^`*GQ9ovy+ew8_`$} zY(E2zOr=@t^3s?^9eKH_6WqBT?HQnOz4xO{d^PtccspBDvNJxBsVWF4@UT1aC*hwC%Y-H31-tO0<+9RIdn0-w`V2Sv<|2dVBBR5U^9`!agU*SXIuZdA48)(G;6Fe6 zsQ?S>=?&cS-CrZ?^Yim=xb+PU0&b-%o*o{>4n>V^ZE3B9(o#||X71Idlj$5>{~jM7 zjpO3tkdcu$b^K;#W@2Juc6Riu5peT=7@VX*{JlJ!Sz6dL5~# zX+li%Wm`3z}#9`0MG4s;AD5b#0e@&^)@pLqjIe# z*cY$}1b=$DGtwkAb7!1L9td_v__!JXgPG#~_3M|1hX(<;5H$g3n5CWm@zur2j~;GD zMrLkKMtb)34ZisWzIhJzSr%sKb4;+Pd$_23_}}g;W3SYRQL@tT|2zpE0{ciHUIkvB z?jN0)(w2GdIn``t-*D>@A+~=fl0PsVO;FVSu-n z7Y?H4THR?GnsFdyf(Xq`7{2U616h-}JihjHcYuPt0>%taBTDV6JQo zoQxctPT|DF+f2Nz**Jp#eK-GoKK(q=k9&ZyTA2^KRn&Ns^%PH zRL!cIEPD0Y?<$Wo^^a_?Dp&ItslEYpzAqWqUI9ggelcoK2$vbrd;u(h2Mdi|ieWve z7jI>zV6*L3s)>-d^yq}_9^&jHt4x;&;$Y@0G~b~}`e3tu@hSeTkZZ($Z}fltd0>{2 zkr`UDc-{H2TK@MEa;-%DI~J$3sw(E8JJxA84!OD5;?a@Kzjn`)6nlGKffr3+wB2E# z6B|^iapIa}VtId)^Q|(+6sg!_E8%U3v~tqC9WwYCFR}= zKE8Nhse8Z3GtCEp?!CKNHJlHjeU?J?-vp(#B8pwR=2MI2VL{xz$7miN5#_G&0~>AH z7T-{Kl^v8eOqA8NoJML^OBcaWk96+=1~=SpkZGUB09R{(xx1^kpR2Qlnb%C?(Q{CqaHF#$ zlh;@H+BlT{qovfE+4y_UP}UC@R`{GkkD&|aVe9cDq4BKf?eSc~_l5Y@9-^+rK6ZaD zx9322TddO&ywthHky6Avv489jN1Rr>*Mf5-C|QZYhud54hu53I;uCP>AR=e;uW>)w z_Ic=NLArWgj(U1S1Kj%Byb7V*038=6Xtot@nqpFjuf z74n{eU$!T-U~M38E1Nu&_r;l3podBZ{<()dvk-co9=knd&at7e-RveEMFK2~PjXOpdbpx&f zk9#y6K7*tw4wliL7W%-ATS40#hmxLmRDAF~D{6WcAxHVb*|GqT1 z4bf%g4vY@@%D|>hc6A#i&Fqxfd%W6D@;me;Z$5XYnPTeiAFG#nm>x*dxRm$c6w{%0 zf1+Mw1jWI>6+B49?uYVZn-NpLi80j#QVRZut-VST9dGLbtk5y4Q6_n2#7c$H7Oq|XN!-=j>ozZ6A?rg?fvdD z3;w5NKCaeX1-64X@I^PgUN>saKGF}dO_LhC|B>SB7ZUp`Of8mJ&Wo*e(bQ@LSrzFA zDhs-1ZZU!HGt+2gRPvhL+76TBt+f+P^kX#?fI3#~Zx{@YenTJH-!{`T66i;(13SR9 zi+E{NDw^E^88^Xe=Wz0S`f)b?-8|G4v|YVv#UADOvxyg|m2q}F2#sb1znJn{*$ni$ zi+XWbggZ@IMf5bUA8+dJ#>QBLScs#q??~PRYefypV(`=dwBV%f&%$`cj$zj|Fo?Z4 
zhRtBtyU^MhAZIW0EH8Vw_D|78PAYLq;f~*C9&8_8-Yya$y}L}IIoqyswPHd21E->L zdS~*$s%hXuKf<8;$A{x}b>3bep2>@JH8^gRgj6W6o0qpfe-1pUnI>SU`z+pl6sMuI z?n{&7wXzdVSJ)52!zaCo#k^?jD?0q)pCEb=QoynF5`1(W4Jk9Fq^}6!z^+_-@L-a#{oWFNhvAbC?tGxH+fe^Zl zp3{5PrIbHmO6?0uKm_<|(5QRUPrT&s?BSnA49G<{#2qBmd-IDS?bg@wFm4L@X{7=^ zyaqM_pS2fxrSHItgFhvJuRHliN#A4Kg@UTCup7MU_Wy5{!fi)}btr(Hw7-1?TkS6G zcuVJaWyaBr2Xl>2<$DZVVLFV1tfDX+zL2Sm2E0gE9M$Kym`7tcCK8Bb8@)BOzb;Ja zQH<8WfIYUQHlYF=UR1aSt+Hgj^tkrYzUF(>YaM|G%D|6}zD@1B-=&6`*J6jRlwT=O zct4>AzIAf&yxbXCE6!VWb=F?4?WDwxJ+C3GlO|M0Q)uIhQL_2ixB7(6C)V-&nDF|n z=#K3EA$U3Jnc^SLiwW-d3SJnpTN@=AUKX~?E;damw{^n38J8{j>Pf`4w})w4!z+)e zSzy*}nu*&q@V;^qZ$y~7^pf?!%2~j!j8`FMUOtA8 z@h_F+?~b)q)%XDZer_}l)w7oO13SQ1KDT6(Zc8HnlOKf*?~ob2*sL65hx)APdD)Hb z92b+E=g2a1z$9kDRHpjX$5^}mCZ5&YKbkGj|AqequwEJO859Z{IpUWL4Kd)ZXE8m) zE{U3zTpj4Gy(`x#Bg#1EWa9P>BhDNZXCxLpD6nQeR)BH$?@#ekCa`q{wk zLvu*Y$h!B%>M&RA&}~M6{D!FLXr@rs@LX9AK3&z46Zl*@r!1k0!s1=M>!22QVgl_Z z_U>)iyW_neq)RNL`| z@;XU@N8Elyq;J=-Ha_^03mt7vAcaaW+pavHMsTjbKqJEo=r>!;<;`eg)Ibx*0I$ zCyd(ZZ&F|<#)hExDXi1_&fdzR%fdZ(@PPe`^3XW zqC{4u7UR7s#ghv&>fQnGgEx1*q`e)Wz(g~h z4gB`>A{?Oh9yCWQ>%2H61*SWF{igd^U#y1Rp0qZzvd5J$p8k=REmK zY@J7>w{{a5^!vKy6?^Bkn@V*D>^Ocp4*Z9xVa7Q}ek)*xm|gMy@{`b1=iS~we|3Kj z4?gMO%ldQa!JE8H5%>~&>+gj!d)Y=@r+f2JSmah3H&l;t=D~?-@PQH0eu7hWK-n_t zHh2*&CI4+wf2+ty*zWCOTlIt4cz3-p=6-kELFlh3OJWC){u=ZRGGcSuvtgmN(Cv|!`Q}rCee6>K{I5BLb}8QBkQUJdk&8(o4qc)CSwJQm z(a?O~rNVLk`UR^E?UtB%rWXT|%b=|4MGdHx17etq+|$6flJy-f+Un<+@lQquuW$KY zDP?=33I>2+7HKod{}NWxe+(+Y8vA!*YhX6{lf?$mPZ<#3So*b6z_vg z^KsSHXg+rle!iC7G?W=4PfQ8US9-WKR;T)XU7lS29EaZnj{Ju_guF!= zm?A0V0v!CVI`gd@%ynH$dL5SoX(@QBkdMZBMAWdb*i5OaR>CK+82h{Mk?9LuIH{%< zgfBn?bhlRRyX@94vR*5j3~-?ZHjUSsHd@htGRuEfSQ}iS*mbUEGA~##GJyueF{}Ru zJmnr)P3dytm5t8KptS^aEk8O3!#@Vew5r!4qGrft=nU+VDI{r0RL|orrrew&KPBpW z#gg3i5dXg*n)5!@rUYJSQiRHtXVQq6M+*rw%YP64u&>;4n)UQcv&AKDtWBn9<@Fm2 zq^lFfep3!FqyBG`x$Oky^bTBP3N8L6E=KKKCTD7dQXC!46!Ny1`qvDI0Ctcyu+4|1 zHY%_h(bOuBih+3FMqfie_>;Vy;@tAifYvC=PPx_b{Dt%98_Z#pzr8;lD`;q@y6HEv 
z^fx1RrE7k0=<8fhF4Lm__x?mV=X~p7ld4sn)=h4Xhlsb#PQWkJ-aKu!lzc1sWUayL znC;IN@=Q|2V>m3+!J?|ui`(WA)-F7Y-oLRkOB;g>I+_sVM7X@>>H^C}wJcQ9HOoH5 zRgqYA&JfgiGT?L5?k~_+I(%2yY0ta&5Mus5qu^ZcU+y?@USi^^7(0`nZItn=gi3Em z>B|ZT^&6Izem8D$T+;ClK0OdT3eUq~`xR2OU2yRE|5@T|CE%$k&V;f1TCJmn%6cHU zWmI2hXfpe*BbWGTUbEI?B0jri>cdy?_6z0Cf>@yaS>5xlu+`9HyKcCEisSnA&@jgr zv)mb&=nt=bAwm*1%F~OyyDJL^OY@ZuKV21@XNZl|Fa^clvS^SM+(P~&{+O+>R{Ho9 z&tFUPWd}uLL8j&e_a6hehn-P~kNqjqp z;!pk=cKnoU`+FCtw4Sus2>`CIZ+Y?KQSrD+)7PcOPRV{^#-{d9UCsSjL*?Kl6+1qT zuU&CgsR$Iror~WM$xapa%3aPcb+J6JQ^ZRzlVbw$P;Q=p3Ix&4&yGNnRz9xVvFTWk(-GXppTGUkVjdo{Ov1n?Bk#fC zDrK7QHc{6eyVmkwLDEk--S+1Vt_^so(l&iC2qPp({ts%?xLKn}R<(jBauBk+sI~W4 zONT8k9%NNmG%n|M_vba1&{HL_^YmCkoDj!GuDYQ`$XczUwlXi`#jP%d@S1Xy`|*i2 z3XaS?RWw0qd87GnRuyk46MB`tZ{vJgA@v&!Bo-MSb+6KQlj-J>w}{fn-sUNNp!#Bf zo3VjaUt0kXrK?Y`gOSsgV%Go!)0jnaY1zT|75wT?Ds7R5Y`nQ4?pgu89k~U;`M)&* ztRg_c2{a{6@~Pcfez`aK`gexk!Q20oZG3TbwP2@Xg#8z!_VTPi|KEVxPDTUu#wWeM z5g#JF<>iw*?H#(Q3~EZ0KqOyog342cxfneYH*82=ltI~b9FQH`8=??d;0DJ>b;l4% zsZV{{az87ezl8dP(2W4%v5XQ{)_;m2w58D2p?|y5Q&;czDp8tZ=3*a=RT_tqc*0dt zN=WGrg5SmYM^Jw5^16A~L)fJ;{t12K{a#wibzYG;gpcXYK2yzTT3)o^*Nh_P)Jd`6+vDiyT24ooYdPlT(TF1wr4}9-Z^0h0D7v zn{xlyqa0HN2DW%5CEql$|Aqx5uY`AsOw;&g-&8GzcloZ1h3nRU`lW}LSJ)o)3vkwB zoYBa=Fzd!moBhSqZdk&{-t&iRm!*lXuFur^(o~G#;X$X@mpZAbtG5#DTG!P#G50R1DA!)_oSea;nh%MLk2CLNU0!d)IUnON3>{%3quzs))W&i}deGifP_3OLi7wIyNc zmzs5}gnl1dk-=4x+HdUtcEL|2l7F?emGoXUsveP*ollfab^`w<#&WVl{<^X8d$K3=!|WAwwfyBEA0yIQN<&FOxhglQ^pl z=^ju36*eB68}{u32}yQ-{`tuXtPmuDgrHLpagugEP zv7+9Gv?f&*+YK}iXYx@NcS6CK$}FAiMMyA491qz zw{S}RHr7?LK%gk(C_9Jo)DSxb12od25bgU zxh^x=+?7wHJcFdTLA{8qDCzlowFQZpq#JAw^jKVB6%Q(01gvn5`mIIDpyc-bn-qf8 zU14kgLiBX~?0 zk=jiBNurdMnEvL#nF%&zg?r|Lnzc+JMAoaCZGBW%5dgARqz(VMqpH ziv?h78vUEUe_Ul;C0->yP9KE3)$l>8QUg-rj7Z>bv zZt%A=3S$pZq<|?RdD6er$e4<1?{3=#bKvo~J290~T&5_vFoCU`*Vn`BLejA(+ywW+_SwH-XY;Z+0-`2A1z#7Vq}-!IiSMl! 
zb`qdi1_*v{HLq&uu{4%LHV=>Vhmrh4bc$ zx%nQj$jSv^ouC%{v<0{V4JpRtTmFc2u;60u>|#-iccy7(+XSdf5a}aAtusU{M5cep z>vRr%=lplHI*>ew`4#-4@Of%?U}T)T6a(qG#n-&8tM%ZIcpwE*uKd)-;Xbg=(AbriS^?+gop z^2~T{fbN+@Q%wA)8G$Y}Z+EhUczaNy`vB%xH5~+^ufP0De?krVrn`=qvLhYe0b0v+r$&-r3$rN6JNBthdP_I%Ocz zTNR`Vy<>w5C;~?U$6#CaWk2iRpaAM@D$Sn{cX#U0KA~4W`t<-`UmI*tP4DmZ?+@$9 zFze8^0QKyY+uN1J4Y;z6wKepkB9hGG^-qA*PR@K3%V(tGx#W4j<4aEmwcxWeB|WS| zJ#L+%`RQyWI*}kxrs-QC+0Eou#YBoH?^nBMfVDlAW$S|dX%Ss!NPvgCinrn>^fo+yHHx(f*gsV5{^$7GZ3|KtRNmrN}-a!h$(zh zuv`PwrBQzbfEElnd$J3y(=?p9lCPHRlWblBzwdN zVLiO$rOxicy;*?<-Vt(VEH5ut(w9C+)*$98+J$@lf{%-<29m~tL6z1{7+P!VA0L`K zA|0tAx5ujx18fBU<~eRaO=Ya4Vcy1PKK0q^Ct3gcw)49W1JJKNLIyx}e~Q&{0_CAv z4-+fjOM+mnrk_5pf5cCuQky$!CyFAApZPm2fX&@6Vn0{{?YIh-^!;MZ2dzxr#q+Fo zci%P*Q0XA5n?wZICE`4b7iN^`$sRyPR*>Gx6c#!aS^qB?5I~#|IOzMj`g+Nsx^b!d zaJdM4oZXd<7Z8a%4Pi5@ft@x`Ejz+jX5R~sEuAf2G6AI)_L*ZwzYFAWj7z_WdsX_e{R92lv zzk{%om58>?v89!!<3!(zCw>E5VLU|hJT8R&?JgynJ>_mpcAZ{_RQROimJp8<+FNq7 zb;S6FzHj7xlc-JGzM1RDhZHVpMeumf>Ak-&12VlHyMuT@6B(4`dEpLwaoFmWAEWaC z*yrmkr9+MEY?bO5?}W5l)>sn2wOfTf;BdPxt4prAAEVS-Hc$N#QVun#(0pV)X(L&m zgmDr)5`+N=9Lqb+f2FGn2mq{z;=iu5ARn1!IhG)KZRJ7kLi~&R@4WF9Uq9v-u!gtP z4BW%`?zBDGRJA~Kq5;%d)g2xZWLoZe&O3QgR=q!4rE!a+87}!67Z=Cv;7-u&HPz&HFd58v zSDVe3Po{EU>E-1GiVsR*P@S1Ls3XibOm+N>7@wX_pkL(Vj4TMXGB+pP!DLM_#$q)> z9enO9D|_m16)M$uv?#QT+=bc}J~^pUJ>G@ebL8LB+6v0Tc29@foyjl?IQne@Mm2DN zPV#%ZThQdy$UAb_>~@)nXV4$6%nVC3^1B*zm`zb5?#$C0f33!~41k3ZOzQK3^;_1p zHnzrd*MrlP9HFjXE%S$V4~z~kk7VU7!gYoen!qZ@p8G&QA<)5SePC6X6RkB^zk9_Z zT5Zs0OE-7(5wC+D#TdN05^&#^iI+&{vLx@h zP=lDLsAv@KD;xNl4RQ^R=+d=AL}f_KX{!Ac?db0EbNkH9%>B(l6geq9&YEeH==nUovI&ya?}7L0u>A2m3KU+yCq4F>{H?wdeB94P-mm#~ax4>z}0u+$TQWV*BJbEgc=Fsgs3z z-*T(_oTGp@ejgwGVA_xmXJX!)w?`l#vl~@yxxw3|7dzNriO=hCw5_Wx5FB7?YO1G4 z-2QVy>T6D`^w=m>gTuaeh|%}rwkjUz;GVPRo%a`O22covrx1`&IqVg~C)4>;h<7o7MR3zD{l z!u#Ul;-{MfLIQ$c9eoKBD=-1yhT4wF$jI6~PVToNDYlmjxz1Mjh2PFU5D<9YpDtx* zlU(mS-_yv(7ivPvhRn=p@;(CZw>LjNiungipT;OlN=Z4bwTHi#;zAdKcSlpGsHhy) 
ziC4;>cb@-M=moL%VaIm{d+5b`^nz}l!g?-eVlrXx#BO&f48cwZ-afPQf?CL49o&;R zZ9)KgI5=53*H_qaV@V-T9z~#&l8gF1;#EWh|NGf8iaAIIi(%;J6?GPmO$F99O24;O zgNT_4NZ;#nCX%I_N!$w;0j4y{=y6f-FBF?Y`}8prD4vMdRns=Ll-mR4nk|JEg{hm~ zg;8>?wzKZlqTV!zj5w6zQ8}p-kKd_p8Ca~@i6uVwnSIsQ;@psFLCA7#-2-kfIDQiY zyy||%u#KeU4<0IXny&Pb|ID5xT`SVm*p9;4prLKCF-F>QC?P*yY@Dvot=}S3!9g@p zEZv-B@mFWkfGDbjsWu+5zGb=54Fd7s=oGX+yg%JoGJYl$QIK10^6L3g8@!KDX8#nw zDZ=L%aV$VCNQImiGKk`1%}yjR3qxEjVaGlFrShj%g>3szY52EF^y(6@1id#b0PBmc2)3fQzxY?axE-F&U}pT9ghgqh-dm`cC2`+FK}FpOLe< zTwODm4ODJS0&DbR(u)apHyKTvbwmmEMRtoQc#8R$>`q_^S%UXkJka@VH&UlxRfkqL z>Zy|g74DUeWDhVr*%B<9Fhpp^IA+fL;i0+#r;8ldJFy5slMRI$S0Ba3M@RKqZ1M9Z zl^L2rdsMe2d=}qlDd6_Nz~6v7+vjsAvu=kZVY*A>TK9Pbep+s))wZN_c2%96i^ZQM z?d>F;fSdgZO(|{dd87Ah!uDY}f4bm-@3;(*WlyR4P*kd@ey9m=UH<*oD3XIOE>MWj z-yt|NAluO`P4+Ye_4W0YfLT|3!veTM#(34``%ro1t`2n#sZk6K2{}A)-0ijG77!CF zn4}D{1lUO`9&u;=g2!{uLSu&eTAsxpeSdx5iaCoTxRl>?;S%*aGBV=-a?iuZS5glo zb%#e;BW#*b6syHkUhnqz_4MQw@p8wPSz*c4m)<}H7BusFa75+q?wWH)Pu^{w*@al+ zyA!~IcUBWKGczIMJX`Wweo3QL=5-epI#j$VrS<-S)GPAX%{MMil(r+JTMh3?a`iuE5pBM-h z2zU|D3Udh9+%9swA>2lmDy-=nd~LE^TTT#rGoQ_zekdWQbbB1Z?tTy8lybDq6$)J6 zERv${dXiaO6H0vwT^Kd`&1R&aqAW2P(i@&g%xQlZCXY+;S(b`{N8~K0Y9rt}LzV4Y zI}wTJOAq)_jKcrg<#Ju5<<8@yO%;u7LWuLg+eqvWC0bxK&2aqk>Bd~qtNOPe83}w! 
z#r62Wu;$p?sdB&d7GFMY0+M~UZ8*{pgR{@_)WFXtCs3j*@C{9&Id> zAe}GaGBNZvhjRt0TVo%G%d;0cfdhm*&zi?N=#sZ*a0X)1nInXMgqx7$5!Cwb)be3l z9qfyv#Ys+af6VE5JpNmR(GS$UMW6+7aRCkZ3D3z<++mhEje)(novVAFbjD#zj;Z}OrA237U*F!|o|cx@*I)KU+~fc?lYn?PP9S0L&Sbbd}FZDne9yxJZFN_~)`yt&@{3;87!VgbScDE}7Y zz7Ut4p$}pKWN{*H`)K~5Q1{(7pmN8x_M+m|)m2c1Dkd3!Lm>E_!v7Zg6gp3aZ6a3y zG&O~JR^~)(oKR8004Msm844k0w(dummay>fz6J<`Zcc`F!%VFIG(%h_A|#B26KwK* zIESE&Ocy4(aGT4HjJi5DcZqr>21H8yy)kueEW{cr!7aS0ZCyvf#FX8bpPR#eF-4J; zm0i4FvMQKvWnkaATQCB@zg)P)D<~)&#^gXCr!N$-=Vnsq(6D_Ks=|9blS3^Em&5=0 zw%h-ucqk+UW;R5n_csz68kvBrImCi$VfV<~9qsMKc!0J+3<`<@NEpbwzp*c~wE<*9 zfRvs7&m%=@$oO7wr?qEZia425qtl9$GiT39z20HvW$%uMw%#dvI5T*2GGE6_e5aRn z)MTl`#=_$0;P5+8jGEEf!Qa0&Az|TaM|RG3Zl)|I;e${JHTUkmW#IdhQE>@{rk0ks zj}HX;^q7nL3c0e-$jQ&MvW_Mu;v5}Wfv}^PqTUzlT|z(tqPD`iy8Y|D7&Um4C+}Bpe zIF*WEXjWDg7ng)v084YNehZB3wD~|{*dK_(nw{4!N$>0=JB*D*13#bCH|u*?A1a6c zK}U~dw+$Aa%ISW)w?h&acs@9u`@6RXfdw{Y6&1fn_T$+qT>dzCc%K+T#Nw>ExwITU zr_bA)eQ&K!8zIC%%oJnc>{zKv)CpPqxVXE+-nW;BVrryK2DFu54yfFPV(M%F*L(Y( z@^XYyb@;L311elnQnwQVZ_Pf)k_TQw>_b^mT+A#Yg3t4ZDKO9|Rg$654F&b)U?L~9 zDM7msOy#pP#B+;qDzO2X7yZ0HaRNL#K1LI<7G*{L236^DGC#rpxyI3r;b{ua=LumL zRL1ycBRr5O4*cLR@sgOAhyLfg%kISmW;ALtg2Hczn7wR<6!u%0n+MSuw!7OL&6H8v zlk(a_ee43<9Ya_+J3CvvlM0GS$ZC=-O#$8!4F(wjl@o0n9ih$ZV%=_`ZdwN!Vgq_H z$3``Eb?CjqHvs_w%3SvjXu$=jW(sp0KD^oR7kGnP3|P;zW$DwJ)vBi6pdhFQciQfk zbD_nA7i=CJ65~0@QZK}!h5SoPBHO1=QuK?QKZ@;kOc2b^H+nEF6~+8_w53ledNDr; ze(qXQ&j=eq!@v*)+%n==AEkcO0LmQ3Mn#FoRI2(~i#r895D*fE;*{`llb~A%YHH%m zSi_rMS~X|a;fa5Ar}5_BP0AGZ;dGbeXJkZb2|Dg)*UW>o&X^9Oo% z@exL{8g?QHP4@@Bofu7z4-LuS={1@opShLpY;OZuMr2o9cx@JnhSg}=CoK`%k@Ear z?j5X$d`j8u)Mx|)Pv&d+QLRairQ<41xBi5WRD9xr2=8gnb~HExeJw4Ii**qq)dQqn zG$Jx`^7r$Oi(+PdNJMG`TEcDXZES3;RK$Llb%MFgn*^VHS66EBd9NtUnX}^KSA;0TfDZP}KpWbY{_XVluYYzVTBo-sn_}gK>m})Nj z{siC9MhB_Bo;qjXx_v(AQ_jhopU7VYR4TG&M=e_g7)=2*Qf}7)Ao-e52mjQMD%J%X zE%dv&;-xTZy1^$+9Ie>}Eh_t^&h9owHfrHhU|$#i?dE%zi4@}f@sh{L=6lM9PYIRXr5}`R{u-#P6pf8&-TT7?9VSg zd*&5x^aie9$ONI|W=`~}5GEMnBPntBi%LjHZ7E|*8gReI5Kh{!HA1%#p_7b|-r6<; 
zCXV--qv>J@XXmu3f113Ja36af-p$pd7YSBMZWMntrI0SdRF+{`#=p4~rNzZ1XZcDZ z7ZbM8fgXNHd?-O!^n{3c>O=HFz~uE1`px&@blPHe#N?157ki{+E`u$PRAkJ=A84B! z(;Mza#cukT7!3Pd7`1N^%xMVAB^u!BW)h>>XlW5J@!M4qnx#syHK z0b(C7xB3sAGZBe%VN44`o$pWeXn{5SvOu(%_@)RlBcqK!lBRaeT&$~-(g>_6r(-0ckw*Q+3}n#(WouoD{}KQuKq zrjUzaON7I3VZAdHM`}Wv$K3}$480kB`OFEaNB^Ja_3xGBRbX7>&$=JpH`VxagYZd3 ze9^94Ei7?VFQEzK>sB13nB+*JUiSmr(l;n5O)c%|PTlWzdN$BhrkcBXdx`|yv)arC z+*5@P)0HG%_G?(+Lud?aR5s*@)7V|QU^jtK5>7j9)=@cSB!RYyQud1Oa=*cu@P~dU zsi$QvMO0U=Xw=45NBsp&Blyv@zc6l1U-~arR|z{m*cK_T&4_!9oovJRzY28Yd%~{f zT7BN=iGgFfzVSSW%MM#)wEh)cK2yAUXQxwW3#a9lmj(LXNlx$w;n2yZS~|Y*r{&zW zbeu<;AWu}Vu8sk(+cKXuCzCWb-e;IB>kz||F!af$ZU$bi>zM1Nka=cV7B!eW#tsKJ z;8~82aZtx!B_hnH-0@TΞkn64&R?NGRF2wTYjxOH}sGwqPd0eX@dYtV&pkf<9{l zzMS^URZ5EWPA+wmTid%zj_;$}sojr{(zMnYN#ZlMxUI-&22b`7dcs-Mv&;wLVgD@`ZAjb4y=70 zU=pW73-U1vd?JS2yF)MNaP>!E0(}lk_m}s3KjMRJdI?cxInS?=uCd?LyxQL>vM9GIVFvKi($muV z$lEz7%paWsUr687R6n4I8}E&!Q5Gj6wkpZX_Z10woipn;KzUIEZLmC6$;6ZUa=-h} z2vwKU;SD;N=pWZJ%LW z0;!{>ZD3N5`*9!`9~c-|DSgr#CXvwBE^#_--VP0?q{Woh;4>JFPdR?5+6<2$JR~K+ zUH`Qe;~R-T49)bcS9SKfv4HYMyyQ z-F&*cKZ9lTtw6n|icRr&eRQcT20Q>%v%|nacM6pE{1`Dxx^Jw^)l3L zbEaaFCY|QKbDV>TeDod0!-vfl0D zmZ-2s<^n#ioW@vd_k4Cd-q2);tF12QsmZV^W!MqBe{!o(A;aW2oP%*O?ly>toqpg zbK0l$+cOZX3AlSrSU0pzt3r#1Z|-cdTY!~UpF2nGV#P_BjMsg$GO0fq$E-+p|{uIONODMcz0t#_IjR58Dx1wVdI%-JSJtEh;gk|%u3Zm(7mUoROEKokj;SHXU_ zQeWv4_+qPI=x73$s+#9&MffJvHzJzJ0_^S=!Z;gSpLZZYtiU$ z0b*Q+*x+kb(Eu?zNEYoPY;Cp0?GpgXMlL&6zu7{4Q&=zqdJV?6&=x|i&4OZVf2FoK zO&_rsukNmbUe|E=f#3tQ zfL2-=KelWFS*Onq3ff|`%ZJ#{F+iKMw|DRk1_qd~*+I^PXROZ>Q?5A!92ylOZz>+# zgO7)&hJ$1oBUR`u;(LDziJ+p$^5lBVN0T!%iAGj?0s|o-5rk#hu!GuIzy3nw<`)+B zTWxcpk%_&$yo7{^1J;Ps{uf;j2xvr?2V!6!2wNH_8!w0eZXa(!s`x1Ix`5B02Ow%& zobsd4VBva;osPV`yp|R|e_*TQnxW4gexl5!n{`S2)nTfyuA=-4Brjd$H(RNW%b-%K zxGM2mA}A?{-1*wD3-ETN8Yl?qVRih%J_L#VS5_e1c`vU|H=zjV#~VFhNE*2?_n8gu zWJu&M2!y+=_5w*YA5n)-iN1`lt>w*_Eh6~nV(`Z68w*(caespS0aFsO|I=)EGLr*V zSxQwh`B`G~>;fYZ5+4_0U?&#ZeNw~H+RR6Y08h(O6iF8Zz6%JFSq={l<_Y`YOvoNQ 
zoGds!oUi>WF86zKwXvzHufKTM2uuKlv6jV?N}L{rA(2U%O$S#-y&mqfb8+2F@NJwf zx12zd9ETC%hK7b~r)|z#l8%n&(fFUlXNyb@fkk()RL^iJAi_(?_-WkDV!g;0g6)|dN={2P*N z5*FT1v^FGJkSQ866wvbpx0Ca9nR>l*ky z%sR=KXe|_qGj2lJ9TTR#4>~pytp(Z?shMD%xlqpE47SCn%C<B zXWxj4KzbZPt_f{N1w}WDNYz3hzz;_ZcicE_xuJm-SqSIB^D(E2t`b}J$R1 zb>Q>&Oh7dvUrF6#lz+64xYsX+`-zp62y%-C{^*~|3m&1opf?zCuaA5Eivw%HthE&@ zv)C`1&NUy@6b*p5FP;9T59A5aho7O2F@kLAJu{$*PCM6>MZi17SCGt8WO%qlQzwPF z7JjN4+pRRbn~2?8fMh2vWNj3H1ya_l|EEh z`tw1gNM**n-07#Ys*@Mqdf#TmL@82f1|cX1qd!N6P!I)eUDby{q$C2LNU?T=+fnhw z7B(7qabq|C16rrz8wm%AA|HPc`EoIDN?{lY<%LfbNWCA3>>zYM9eqfE=`M}%lPyFX zqiYSs_aU)Vh4?5RV-_s7W)XA`WjY(OPl;$@BRy~q3o}3S#hvMM&Yy}753c{!p`6}H zk3CDqc(L?N)3m(YzjVb9_?u*=PJE^*m$)r`&e{~P^!X(|BBd$dCj_=8Pf+TSYU#fU z@vg{Om^H!87jZe`*r?fkZI;_I1(-?2 zAPCd~Fa#9hB-r%PLI%z0T9lcZgkK6~e>F>D&fvGiiQW(hD<~QKa`dvB?dgeb*|D#Ui<#HZN4tEiX{(E>NU3MIf%L{5=LgC1yft8H2k%oO+OV__no40b zPv8GR*exL_b-^R}BEML;5BGsVl3`cftm{jFK}aTf+8fHIRSm)$l)OIM56P|xTM^nE zeGfUBooLWc?qbvCC~=zA&5Gro~RJm#Xei2PtQetu>NF+Q8S zhTP-fCzJ$0n4fi_mG*k&ML`^k0|&Z{AqM2~kz6vk6tV$|5c7?Xg&%tPVa`hJ8b12K za+HXDu52Y7QIBC>B1Kq2&JJd#wkzU2+J^X_Q2K~vNe&J#>TyXR zLwj(e(is5p&eZDFa53n#1m;t=O*k)P!Ja{Nz7-Hd%DN@*y@ zP^NMkW6OrXi-XQL5tu#5CCk@=%v)za(gH%tn_#2-$3*DXQ|CE5d$Gu2pUb~@OIZ6l z4Ade1=Y_3;A?!j5|3o9)rp?6`X&bjVfXzZR2l%N++mhQdt;&LLZH`z53CV(aJn15d7Yy0~S@$eGCE zjiiPQ{Owuq)t#dbad<%%w#nW&a?u* zq3r?|<^Au6&OC3B(>opn+uRRLfioZX++#$a;q&ABe$heQ(zta%ac=PT0*0#lp zSUJ_LdMui`B>HaJ4n#v*6ZY$zjXL**HhM4K2m~jLj`*FjOyv`&#^NS#v<0WWo^w7F z#PxXQ>~z_8y@c?9{0OOsJ^IG70ukR;E*zyKtHjYLaFXJKKXrLAq*iaR(s$cUkZ zjEt-nPBzdi$XLqx`}glkJmlHrtgP>F+nPd^XgFuJ8C!!;tZ3iLqGC=fYicyAG`-h_ z2Q`BJ_sD+K+vTxGBGJ?>Iw5ksr$FnQMe}@)d`q8x9Y>8lEmI9`iNX_m1v76wby0&L zH4j)qM%`T-IpEi)LQPFemu~@dW7ORlxuhu?>H71R{61vTNiRp?iLxXd(KOAi>%-v# z#UzY9DVgYsYS>DWkHmLPm+n+mZNI*J;?0t;oz}LZMv_~!f?j-AwQh_gkT4`HY)S%j zm2%?JsVarxY4ebT-(P_tbrx!GkUXiR5Mz!c%r1(HvNC%7{@xxP14F29Xme|GGi6h0 zY3U-y2qe!$*i5l`&uD01fbLL zt|rc>5q5kRGYFH!oi%}en%05;U;Q=}Ll_0lndoDfGKh}Ap_sZW_Zu`k)no$QZM|&Pcuby6U9Lr@Dyhu+=Z6O=BnhxSZtwA6;J^ 
zRps_>jUXj0CEX?6-3>~rNF&mMbeA-OG>CMINJ%5zDcvC5-3|NwmgnAc$GyLEzWo=B z0b}oX?WdnP=VHiE*GVzZh|^QU$+n4w@rDVA~ z$8GcFf+6L!L6&+r4#;*Gw@sA(+kniw%lQvHUz#VCk_G$APPINs-6Fw1;bnAnbmVrv zkV{-2TYm03ni>B|xarE&~ zm&7c5^+VTjKar}iWyCJrbc}Ij>Jr7eu!n86vCzcD-=nIA4{ljk(BNu!N--bDM~?-) zLjD5UlwV)1X{m(8g_uN}i|F%zh_DHAvsO^jiL$fSbI{du2bSz~fHV7Y8@H|TOJGOQUHotA4vtkxDXXg-nb+rk zPGK8a-o*&bBb1Uaxys?nWnZuLRwOQla-E~ zgNLw`oXUG2dqpQ}(v{J^@2UQ6bC+#n6^_6{(hQ?xW0I0mwDb)81a{P|y&RO>ouAh~ z^_KeJGmV|{1rIkPTI+?Ln#Lovn5&;>z>^*@$iCz8oBYs1W0!5h6&^}gC|lllaEU!u zIb+2+Co@P*{EOai`k`^>kyOzN2KfV@)Zr$o=!68Yxt*vUsPAn;AT=%~>p0LXE-Q z|BcFhOI24QYuiTjAwu$Q2=O)$=Z17;?^bO|s&8=6CoCWCe;g{VwNc^Du*`_(&e_4-W`DEyxxn>Zn}2QK)!H~7E%kx?!E_Za;#*^is=`r*zy|-j zqn^;Tde{90T=c4-PT7KO%j1uze*$+pTduiTW#xCbicIb;4Eh|(J%ZQDAVPNomYKOLh@fZwhsOkBXO=V}s zZJT$Fc6_mjr>Exn8_O$u+U!Fv_Q|;1j5eo)n=TJB*wdR^h}?~7;2CCUsqJ5JV>HNS zN@7zG{%Q#IXNXL5dLrc2lbMBlNl~uE7w1>+czf`{=bkJWQ}?Hn_Pg+~*Hn&o^RD!l zdkb$*yPosDoM6nGvcz^9wBXq)BCuF}?M9AZ%U6wfosig)gySZODUQ-3>49zs)x2hL z%U&8(-Go-{)n328!c%?lKiQOFwR0KbSag;&3NX=Blwo)rXT(c1kWIN9W%1(0IKO7| z^JgmXoVO?a>{CVBaWSdsiAgD6zP#4cBPAlzR#O9(gik?XsH2k{A1~vb3nqW-YipDb z$^byFwzd|GF-Y4eG;3nBe0NYtNMd|^d_sb(yu38kBPnSB1u$kb z1S?59JC4zB@aNRXA>|bnF<@Vc76cXo!&vJ{E)ob4m`DReu{>vm_4hcgKDq;V|rK&^8Gq~mK!??U(w5$ zzOwoSb|GI&|MAFtip*Nn{w$IECV}nn&P1`Hmk^yWU5K5GMHLQBj5q$1P(@h*3@QR+ z98|f+SKZpQ#-qwVwC0UvK2^$@N@Uj+xuiqz0e#=zk9%FTnyb6!6s4Z8Eyra?#+}{8 z&Y&F_piNBH7>GG9m_3dWoSG6B6Y8Q*X7yLhf4_llr8ta(76rp`LqHo+UqVDMpCp#8 zY8~}A!h4xYgoRq%K}4%}K_Ay6DQAmUC0DUvG&)u<>U;^un4py0vMQgafpc4lR|iqN z*ZW5-E9F*ZLnD|{l(F1|L{tTGh(u&~l57_rrHMJpX(%~JDV@LqkiR}licL;+f$aqu zhcz^=iVbg7iR;j^!owM#w2UAJOR?s|)iij9^pHDznQIMUJ zlL$a;FxG{K_ooTFO27;~_pAOaw{LFz8=G$r>eYDdS31D|1mxXbUxR8!atW+#GSBkz z@?bF7@M(zn-ExX;Q>)<|d-L5%P4m4C7;h7@>7h7a!XNJMi&{_rC~bz_taOHhoftG= z>jRJ$hXCh6OHbeSlyrZlf&mY_8^Osp_=^VE%`W%|!S~%!6wLq!1{O5{t)~9!6;cKM zqs{_pk01_GVtAe$hy*KiKkoPMKK6~7D%2sCQE#cmVd*fb#wO9Ou;Pt*X!8!Y zOpzcZaG7rC4QHIC(YRE7wFG~j8SUrzjQ~j3etlQ=$X(Hfnso8E8ko>TBP>~}0zL)Uopy&RSgA${fw&y*MNNk%EHD5cd 
z{Qh|$dV~H%z-fLqVnB)4{Bc^^?;4wV-ZyQzj^=D#-Sif^PI3{;dmp*Gu_!8%iWP82 zq*zRbKBqBpqGsO9RT)fWm!W=c6uy*^m3j5@1@YN8eJKti-knJ^9zQmBqvK{@bQDtd zrT8St!wI2qrAq{8%Kub1fO4raIfPgp0 z4p^c2i%^Jq;IWXspr|C>yE&MzUpfM?oZrKc~XU( zAN>+%W=6|PO-%)`<(37>DH<$$2i4 zl_bV?_#15@<8*UV;toqwAzx(*1RY-bR>-m+GQug@5?DJAe23-8p{EO!Tz-p5xg=j( zyGEkDY8e?uu1Sfgx?F8**r?4jr+f6|sag8X+rsY!=?4P49@EbwwDV(};Bm&YXN@LO zJ}!w)(eR4Pwih(t9fXU91n8na*L8U>6^KxstD0BV$wSyxiT?L#&iaRdU7rUw&mPRua4UC%?Ya>yS$im7`8x3t~=f zA^AoB@HPy;!ld%`dB<0pP^$J6N^9;Df}7gm-E-oI;K)b-a4y!ZS$Qw#t%`kn>3gHr zqQA?O4v+r#OSMb66#^lJOw6I<5O#DsCDhymE%^apXJBndeC~K@(^WaZZ2@RZ8kgxK zft13+U9f@O{^SjSPXPbr?{+v>gB1e6KUzhv%gl$;0dBOVMdHXE;9RdD@Xj{?$(Yr6Ui^67H- z?h-QL8XKKvb~oWN#S>%=+(pXQF9M&PC(Zf5GBosbI#L8?7BeL`Ai1>A2f8ojIeee1 zC3Oum&SL7-F4vm`^vPhR*Z%I#f-*dy;{jFr-yU`^sZ$Nql4#Qm)M~6qD$vH}@Aa(W z6XdeFdp-?4$-w2_DSqQk3ToZqW4Sd`{6!I4SqZeM7lkFW_7?9v)Rl6+e(hEBU0qmb zz2&o#!IyvL%Dh2VFh z9X}c-rd$I+CVe9OeaETstzPN(W%>$J%ipgflLovyaWTGwu_2ho3ylVwsX^2@djGP` z^)=*T{br}*Sb$2ivjvd5A7=nwNO0HO&~OJ9sE>l+S>6L<5O{R*DaYvWDvp?p3?thI zB#R$EIDqcyu$$TefKbp34h$f6e~mIf#XbG*2?yfG7BwV^9$*{veeezn3 z;LptZ@xyN8SJD+Kd^5l~AG!aK&jnxywGj}sK=4tk2X;D=BOil6M|nZWZg6&TBII)? z@C^pmA_VLP=;>H=p!PxBj0y{DqY9N2gh@*;6_z&n2Z3!2WMyzyzGQz#F-5Hpu`5ju z%()ip)V#T^o-ce=^t;i!|E7wWuSC}-ZVC$E=>-ZP8|*fO(B?xiLDAwLl`;$RQE?_P ztk|#ZcI9;DYb<(=)wug=L26ObT9NCHJZ}DN>H2|o~7)$eH=zFIhI|b zRqzf?nd=^2gg9Y>fO<>z`OCL?o7Pc#ZBD&rLXNQM#vWbSbwoCD75pkwM(2%{ZO@U* zUwo3(8p9jT4D6|EHeN5h#`6umUg>F#lL zRZMhbI~Z`(<%2r2fn=~sJ_NWmfU&BDYHYAR-cD$WApVV!dXoMLB$>rJd&jSa&YZRHaInPxie9?bdhNXbcx|}