diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 21e53da..2877a6d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ on: jobs: build: runs-on: ubuntu-latest - container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.0 + container: ghcr.io/osgeo/gdal:ubuntu-small-3.12.2 strategy: fail-fast: false matrix: @@ -23,28 +23,33 @@ jobs: run: | apt-get update -qqy apt-get install -y git python3-pip libpq5 libpq-dev shellcheck + - uses: actions/checkout@v4 with: submodules: 'true' + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install gdal[numpy]==3.11.0 + python -m pip install gdal[numpy]==3.12.2 python -m pip install -r requirements.txt + - name: Lint with pylint run: | python3 -m pylint deltap prepare_layers prepare_species usecases utils local + - name: Type checking run: | python3 -m mypy deltap prepare_layers prepare_species usecases utils local + - name: Tests run: | python3 -m pytest ./tests - - name: Script checks - run: | - shellcheck ./scripts/run.sh - shellcheck ./scripts/generate_food_map.sh + + - name: Snakemake format check + run: snakefmt --check workflow/ diff --git a/README.md b/README.md index bffd518..bf6c58d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,89 @@ This repository implements the LIFE extinction risk methodology as published in ## Running the code -The methodology is explained in more detail in [method.md](method.md), but there is also a script set up to just do an entire run of the pipeline in `./scripts/run.sh`. +The methodology is explained in more detail in [method.md](method.md). The pipeline is implemented as a [Snakemake](https://snakemake.readthedocs.io/) workflow in `workflow/`. 
+ +### Prerequisites + +The following tools must be available on `PATH`: + +- `reclaimer` — downloads inputs from Zenodo ([quantifyearth/reclaimer](https://github.com/quantifyearth/reclaimer)) +- `aoh-calc`, `aoh-habitat-process`, `aoh-collate-data`, `aoh-species-richness`, `aoh-endemism`, `aoh-validate-prevalence` — from the `aoh` Python package +- `gdalwarp` — part of GDAL + +The following environment variables must be set: + +| Variable | Purpose | +|----------|---------| +| `DATADIR` | Root directory for all input/output data | +| `DB_HOST`, `DB_NAME`, `DB_USER`, `DB_PASSWORD` | PostgreSQL credentials for IUCN species database | + +For GBIF occurrence validation (optional): + +| Variable | Purpose | +|----------|---------| +| `GBIF_USERNAME`, `GBIF_EMAIL`, `GBIF_PASSWORD` | GBIF account credentials | + +### Running the pipeline + +```bash +# Full pipeline: delta P maps for all scenarios + model validation +snakemake --cores N all + +# Delta P maps only, without validation +snakemake --cores N life + +# Species richness and endemism maps (not included in 'all') +snakemake --cores N summaries + +# Model validation only +snakemake --cores N validation + +# GBIF occurrence validation (expensive — downloads gigabytes of data) +snakemake --cores N occurrence_validation +``` + +Other useful targets for incremental runs: + +```bash +snakemake --cores N prepare # Download and warp all base layers +snakemake --cores N species_data # Extract species data from the database +snakemake --cores N aohs # Generate all AOH rasters and collated CSVs +``` + +### Precious layers + +The habitat and elevation layers are expensive to generate and are treated as **precious**: they will only be rebuilt if their output files are explicitly deleted, even if upstream inputs or code have changed. 
This applies to: + +- `habitat_layers/current/` — food-enhanced current habitat map at ~1.67km resolution +- `habitat_layers/{scenario}/` — per-scenario fractional habitat maps +- `habitat_layers/pnv/` — potential natural vegetation fractional maps +- `elevation-max.tif`, `elevation-min.tif` — warped elevation layers + +To force a rebuild of any of these, delete the relevant `.sentinel` file (or the output file itself) before re-running. + +### Configuration + +Pipeline parameters are in `config/config.yaml`: + +- `taxa` — taxonomic classes to process (default: AMPHIBIA, AVES, MAMMALIA, REPTILIA) +- `scenarios` — habitat change scenarios to evaluate (default: arable, restore) +- `curve` — extinction curve exponent for delta P (default: `"0.25"`) +- `pixel_scale` — output raster resolution in degrees (default: ~5 arc-seconds) + +### Inspecting the pipeline graph + +The rule graph can be generated with: + +```bash +snakemake --forceall --rulegraph 2>/dev/null | dot -Tsvg > graph.svg +``` + +Note that rules whose inputs are determined by the species-extraction checkpoint (`generate_aoh`, `calculate_delta_p`) will not appear in the static rule graph — this is a known snakemake limitation with checkpoint-based expansion. 
To see the complete job graph after species data has been extracted: + +```bash +snakemake --forceall --dag 2>/dev/null | unflatten -f -l 5 | dot -Tsvg > dag.svg +``` ## Credits diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000..c260e71 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,46 @@ +# LIFE Pipeline Configuration +# =========================== + +# Taxonomic classes to process +taxa: + - AMPHIBIA + - AVES + - MAMMALIA + - REPTILIA + +# Pixel scale +pixel_scale: 0.016666666666667 + +# Hyde projection data +hyde_projection: 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]' +hyde_pixel_scale: 0.08333333333333333 + +# Scenarios to run +scenarios: + - arable + - restore + +# Z-curve value for delta P calculation +curve: "0.25" + +# Projection for species data extraction +projection: "EPSG:4326" + +# Zenodo configuration for downloading raw habitat +zenodo: + jung_habitat: + zenodo_id: 4058819 + filename: "iucn_habitatclassification_composite_lvl2_ver004.zip" + jung_habitat_updates: + zenodo_id: 4058819 + filename: "lvl2_changemasks_ver004.zip" + jung_pnv: + zenodo_id: 4038749 + filename: "pnv_lvl1_004.zip" + elevation: + zenodo_id: 5719984 + filename: "dem-100m-esri54017.tif" + +# Optional inputs +optional_inputs: + species_overrides: "overrides.csv" diff --git a/deltap/delta_p_scaled.py b/deltap/delta_p_scaled.py index aed4923..3925e93 100644 --- a/deltap/delta_p_scaled.py +++ b/deltap/delta_p_scaled.py @@ -4,67 +4,65 @@ from pathlib import Path import pandas as pd -import yirgacheffe.operators as yo -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg +from snakemake_argparse_bridge import snakemake_compatible # type: ignore SCALE = 1e6 def 
delta_p_scaled_area( input_path: Path, diff_area_map_path: Path, - totals_path: Path, + species_totals_path: Path, output_path: Path, ): os.makedirs(output_path.parent, exist_ok=True) per_taxa = [ - RasterLayer.layer_from_file(os.path.join(input_path, x)) - for x in sorted(input_path.glob("*.tif")) + yg.read_raster(x) for x in sorted(input_path.glob("*.tif")) ] if not per_taxa: sys.exit(f"Failed to find any per-taxa maps in {input_path}") - area_restore = RasterLayer.layer_from_file(diff_area_map_path) + species_total_counts = pd.read_csv(species_totals_path) - total_counts = pd.read_csv(totals_path) + diff_area = yg.read_raster(diff_area_map_path) + diff_area_rescaled = yg.where(diff_area < SCALE, float('nan'), diff_area / SCALE) - area_restore_filter = yo.where(area_restore < SCALE, float('nan'), area_restore) / SCALE - - with RasterLayer.empty_raster_layer_like( - area_restore, - filename=output_path, - nodata=float('nan'), - bands=len(per_taxa) + 1 - ) as result: + # Process all species in total + total_species_count = int(species_total_counts[species_total_counts.taxa=="all"]["count"].values[0]) + summed_layer = yg.sum(per_taxa) + all_scaled_filtered_layer = yg.where( + diff_area_rescaled != 0, + ((summed_layer / diff_area_rescaled) * -1.0) / total_species_count, + float('nan') + ) - species_count = int(total_counts[total_counts.taxa=="all"]["count"].values[0]) + bands = [all_scaled_filtered_layer] + labels = ["all"] - result._dataset.GetRasterBand(1).SetDescription("all") # pylint: disable=W0212 - summed_layer = per_taxa[0] - for layer in per_taxa[1:]: - summed_layer = summed_layer + layer + # Now per taxa + for inlayer in per_taxa: + # get the taxa from the filename + _, name = os.path.split(inlayer.name) + taxa = name[:-4] + labels.append(taxa) - scaled_filtered_layer = yo.where( - area_restore_filter != 0, - ((summed_layer / area_restore_filter) * -1.0) / species_count, + taxa_species_count = 
int(species_total_counts[species_total_counts.taxa==taxa]["count"].values[0]) + scaled_filtered_layer = yg.where( + diff_area_rescaled != 0, + ((inlayer / diff_area_rescaled) * -1.0) / taxa_species_count, float('nan') ) - scaled_filtered_layer.parallel_save(result, band=1) - - for idx, inlayer in enumerate(per_taxa): - assert inlayer.name - _, name = os.path.split(inlayer.name) - taxa = name[:-4] - species_count = int(total_counts[total_counts.taxa==taxa]["count"].values[0]) - result._dataset.GetRasterBand(idx + 2).SetDescription(taxa) # pylint: disable=W0212 - scaled_filtered_layer = yo.where( - area_restore_filter != 0, - ((inlayer / area_restore_filter) * -1.0) / species_count, - float('nan') - ) - scaled_filtered_layer.parallel_save(result, band=idx + 2) + bands.append(scaled_filtered_layer) + yg.to_geotiff(output_path, bands, labels, nodata=float('nan')) +@snakemake_compatible(mapping={ + "input_path": "params.input_dir", + "diff_area_map_path": "input.diffmap", + "totals_path": "input.totals", + "output_path": "output.final", +}) def main() -> None: parser = argparse.ArgumentParser(description="Scale final results for publication.") parser.add_argument( diff --git a/deltap/global_code_residents_pixel.py b/deltap/global_code_residents_pixel.py index 6bb310d..bbae6f6 100644 --- a/deltap/global_code_residents_pixel.py +++ b/deltap/global_code_residents_pixel.py @@ -1,298 +1,332 @@ import argparse +import json import math import os -import shutil import sys -from enum import Enum -from tempfile import TemporaryDirectory -from typing import Union +from pathlib import Path -import geopandas as gpd -import numpy as np -from osgeo import gdal -from yirgacheffe.layers import RasterLayer, ConstantLayer +from snakemake_argparse_bridge import snakemake_compatible # type: ignore + +os.environ['YIRGACHEFFE_BACKEND'] = 'NUMPY' +import yirgacheffe as yg # pylint: disable=C0413 + +# This isn't a hard requirement, but in practice most experiments use 0.25, and the original +# 
paper used the other three values for comparison. Other values are valid, but to save wasted +# times with typos, we do restrict this to the subset used for the paper. +FLOAT_EXPONENTS = {0.1, 0.25, 0.5, 1.0} GOMPERTZ_A = 2.5 GOMPERTZ_B = -14.5 GOMPERTZ_ALPHA = 1 -class Season(Enum): - RESIDENT = 1 - BREEDING = 2 - NONBREEDING = 3 +def open_layer(filename: Path) -> tuple[yg.YirgacheffeLayer,float]: + """We use this helper function for two reasons: + 1. The delta-p values are quite small, and so we want to ensure things are in float64. + 2. We almost always need the total area, but rather than calculate it we can get that + from the JSON file that sits besides the TIFF. + """ + # The "nan" is an artefact of bouncing the data via pandas + if filename.name == "nan": + return yg.constant(0.0), 0.0 -def gen_gompertz(x: float) -> float: - return math.exp(-math.exp(GOMPERTZ_A + (GOMPERTZ_B * (x ** GOMPERTZ_ALPHA)))) + layer = yg.read_raster(filename) -def numpy_gompertz(x: float) -> float: - return np.exp(-np.exp(GOMPERTZ_A + (GOMPERTZ_B * (x ** GOMPERTZ_ALPHA)))) + json_filename = filename.parent / f"{filename.stem}.json" + with open(json_filename, "r", encoding="utf-8") as f: + data = json.load(f) + total_aoh = data["aoh_total"] -def open_layer_as_float64(filename: str) -> Union[ConstantLayer,RasterLayer]: - if filename == "nan": - return ConstantLayer(0.0) - layer = RasterLayer.layer_from_file(filename) - if layer.datatype == gdal.GDT_Float64: - return layer - layer64 = RasterLayer.empty_raster_layer_like(layer, datatype=gdal.GDT_Float64) - layer.save(layer64) - return layer64 + return layer, total_aoh -def calc_persistence_value(current_aoh: float, historic_aoh: float, exponent_func) -> float: - sp_p = exponent_func(current_aoh / historic_aoh) - sp_p_fix = 1 if sp_p > 1 else sp_p - return sp_p_fix +def calc_persistence_value( + current_aoh: float, + historic_aoh: float, + exponent: str | float +) -> float: + scaled_aoh = current_aoh / historic_aoh + if 
isinstance(exponent, float): + sp_p = scaled_aoh ** exponent + else: + assert exponent == "gompertz" + sp_p = math.exp(-math.exp(GOMPERTZ_A + (GOMPERTZ_B * (scaled_aoh ** GOMPERTZ_ALPHA)))) + return 1.0 if sp_p > 1.0 else sp_p def process_delta_p( - current: Union[ConstantLayer,RasterLayer], - scenario: Union[ConstantLayer,RasterLayer], + current: yg.YirgacheffeLayer, + scenario: yg.YirgacheffeLayer | float, current_aoh: float, historic_aoh: float, - exponent_func_raster -) -> RasterLayer: - # In theory we could recalc current_aoh, but given we already have it don't duplicate work - # New section added in: Calculating for rasters rather than csv's + exponent: str | float +) -> yg.YirgacheffeLayer: + new_aoh = (current_aoh - current) + scenario - new_p = ((ConstantLayer(current_aoh) - current) + scenario) / historic_aoh - - - const_layer = ConstantLayer(current_aoh) - calc_1 = (const_layer - current) + scenario - new_aoh = RasterLayer.empty_raster_layer_like(current) - calc_1.save(new_aoh) - - calc_2 = (new_aoh / historic_aoh).numpy_apply(exponent_func_raster) - calc_2 = calc_2.numpy_apply(lambda chunk: np.where(chunk > 1, 1, chunk)) - new_p = RasterLayer.empty_raster_layer_like(new_aoh) - calc_2.save(new_p) + scaled_aoh = new_aoh / historic_aoh + if isinstance(exponent, float): + calc_2 = scaled_aoh ** exponent + else: + assert exponent == "gompertz" + calc_2 = yg.exp(-yg.exp(GOMPERTZ_A + (GOMPERTZ_B * (scaled_aoh ** GOMPERTZ_ALPHA)))) + new_p = yg.where(calc_2 > 1, 1, calc_2) return new_p def global_code_residents_pixel_ae( - species_data_path: str, - current_aohs_path: str, - scenario_aohs_path: str, - historic_aohs_path: str, - exponent: str, - output_folder: str, + taxid: str, + season: str, + current_aohs_path: Path, + scenario_aohs_path: Path, + historic_aohs_path: Path, + exponent: str | float, + output_path: Path, ) -> None: - os.makedirs(output_folder, exist_ok=True) - - os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0" - try: - filtered_species_info = 
gpd.read_file(species_data_path) - except: # pylint:disable=W0702 - sys.exit(f"Failed to read {species_data_path}") - taxid = filtered_species_info.id_no.values[0] - season = Season[filtered_species_info.season.values[0]] + os.makedirs(output_path.parent, exist_ok=True) - try: - exp_val = float(exponent) - z_exponent_func_float = lambda x: np.float_power(x, exp_val) - z_exponent_func_raster = lambda x: np.float_power(x, exp_val) - except ValueError: - if exponent == "gompertz": - z_exponent_func_float = gen_gompertz - z_exponent_func_raster = numpy_gompertz - else: - sys.exit(f"unrecognised exponent {exponent}") + # snakemake demands we write a file to show we've done something, even if there + # is no tiff generated + sentinel_path = output_path.parent / f".{taxid}_{season}.done" match season: - case Season.RESIDENT: - filename = f"{taxid}_{season.name}.tif" + case "RESIDENT": + filename = f"aoh_{taxid}_{season}.tif" try: - current = open_layer_as_float64(os.path.join(current_aohs_path, filename)) + current, current_aoh = open_layer(current_aohs_path / filename) except FileNotFoundError: - print(f"Failed to open current layer {os.path.join(current_aohs_path, filename)}") - sys.exit() + print(f"Failed to open current layer {current_aohs_path / filename}", file=sys.stderr) + sentinel_path.touch() + return try: - scenario = open_layer_as_float64(os.path.join(scenario_aohs_path, filename)) + scenario: yg.YirgacheffeLayer | float + scenario, _ = open_layer(scenario_aohs_path / filename) except FileNotFoundError: # If there is a current but now scenario file it's because the species went extinct under the scenario - scenario = ConstantLayer(0.0) + scenario = 0.0 try: - historic_aoh = RasterLayer.layer_from_file(os.path.join(historic_aohs_path, filename)).sum() + _, historic_aoh = open_layer(historic_aohs_path / filename) except FileNotFoundError: - print(f"Failed to open historic layer {os.path.join(historic_aohs_path, filename)}") - sys.exit() + print(f"Failed to open 
historic layer {historic_aohs_path / filename}", file=sys.stderr) + sentinel_path.touch() + return if historic_aoh == 0.0: - print(f"Historic AoH for {taxid} is zero, aborting") - sys.exit() + print(f"Historic AoH for {taxid} is zero, skipping", file=sys.stderr) + sentinel_path.touch() + return - # print(f"current: {current.sum()}\nscenario: {scenario.sum()}\nhistoric: {historic_aoh.sum()}") + old_persistence = calc_persistence_value(current_aoh, historic_aoh, exponent) - layers = [current, scenario] - union = RasterLayer.find_union(layers) + + # In general Yirgacheffe can infer the behaviour needed for area intersections based on + # operator, but in this instance we want to force the caclulation to take place for the + # union of the areas involved. + layers = [x for x in [current, scenario] if isinstance(x, yg.YirgacheffeLayer)] + union = yg.layers.RasterLayer.find_union(layers) for layer in layers: - try: - layer.set_window_for_union(union) - except ValueError: - pass + layer.set_window_for_union(union) - current_aoh = current.sum() + new_p_layer = process_delta_p(current, scenario, current_aoh, historic_aoh, exponent) - new_p_layer = process_delta_p(current, scenario, current_aoh, historic_aoh, z_exponent_func_raster) - print(new_p_layer.sum()) + delta_p = new_p_layer - old_persistence - old_persistence = calc_persistence_value(current_aoh, historic_aoh, z_exponent_func_float) - print(old_persistence) - calc = new_p_layer - ConstantLayer(old_persistence) + try: + delta_p.to_geotiff(output_path) + except ValueError: + print(f"Failed to align layers for {taxid}_{season}", file=sys.stderr) + sentinel_path.touch() + return - with TemporaryDirectory() as tmpdir: - tmpfile = os.path.join(tmpdir, filename) - with RasterLayer.empty_raster_layer_like(new_p_layer, filename=tmpfile) as delta_p: - calc.save(delta_p) - shutil.move(tmpfile, os.path.join(output_folder, filename)) + sentinel_path.touch() - case Season.NONBREEDING: - nonbreeding_filename = 
f"{taxid}_{Season.NONBREEDING.name}.tif" - breeding_filename = f"{taxid}_{Season.BREEDING.name}.tif" + case "NONBREEDING": + nonbreeding_filename = f"aoh_{taxid}_NONBREEDING.tif" + breeding_filename = f"aoh_{taxid}_BREEDING.tif" try: - with RasterLayer.layer_from_file(os.path.join(historic_aohs_path, breeding_filename)) as aoh: - historic_aoh_breeding = aoh.sum() + _, historic_aoh_breeding = open_layer(historic_aohs_path / breeding_filename) if historic_aoh_breeding == 0.0: - print(f"Historic AoH breeding for {taxid} is zero, aborting") - sys.exit() + print(f"Historic AoH breeding for {taxid} is zero, skipping", file=sys.stderr) + sentinel_path.touch() + return except FileNotFoundError: - print(f"Historic AoH for breeding {taxid} not found, aborting") - sys.exit() + print(f"Historic AoH for breeding {taxid} not found, skipping", file=sys.stderr) + sentinel_path.touch() + return try: - with RasterLayer.layer_from_file(os.path.join(historic_aohs_path, nonbreeding_filename)) as aoh: - historic_aoh_non_breeding = aoh.sum() + _, historic_aoh_non_breeding = open_layer(historic_aohs_path / nonbreeding_filename) if historic_aoh_non_breeding == 0.0: - print(f"Historic AoH for non breeding {taxid} is zero, aborting") - sys.exit() + print(f"Historic AoH for non breeding {taxid} is zero, skipping", file=sys.stderr) + sentinel_path.touch() + return except FileNotFoundError: - print(f"Historic AoH for non breeding {taxid} not found, aborting") - sys.exit() + print(f"Historic AoH for non breeding {taxid} not found, skipping", file=sys.stderr) + sentinel_path.touch() + return - - if scenario_aohs_path != "nan": - non_breeding_scenario_path = os.path.join(scenario_aohs_path, nonbreeding_filename) - breeding_scenario_path = os.path.join(scenario_aohs_path, breeding_filename) + if scenario_aohs_path.name != "nan": + non_breeding_scenario_path = scenario_aohs_path / nonbreeding_filename + breeding_scenario_path = scenario_aohs_path / breeding_filename else: - 
non_breeding_scenario_path = "nan" - breeding_scenario_path = "nan" + non_breeding_scenario_path = Path("nan") # nan path is the sentinel from csv inputs + breeding_scenario_path = Path("nan") try: - current_breeding = open_layer_as_float64(os.path.join(current_aohs_path, breeding_filename)) + current_breeding, current_aoh_breeding = open_layer(current_aohs_path / breeding_filename) except FileNotFoundError: - print(f"Failed to open current breeding {os.path.join(current_aohs_path, breeding_filename)}") - sys.exit() + print(f"Failed to open current breeding {current_aohs_path / breeding_filename}", file=sys.stderr) + sentinel_path.touch() + return try: - current_non_breeding = open_layer_as_float64(os.path.join(current_aohs_path, nonbreeding_filename)) + current_non_breeding, current_aoh_non_breeding = open_layer(current_aohs_path / nonbreeding_filename) except FileNotFoundError: - print(f"Failed to open current non breeding {os.path.join(current_aohs_path, nonbreeding_filename)}") - sys.exit() + print(f"Failed to open current non breeding {current_aohs_path / nonbreeding_filename}", + file=sys.stderr) + sentinel_path.touch() + return try: - scenario_breeding = open_layer_as_float64(breeding_scenario_path) + scenario_breeding: yg.YirgacheffeLayer | float + scenario_breeding, _ = open_layer(breeding_scenario_path) except FileNotFoundError: # If there is a current but now scenario file it's because the species went extinct under the scenario - scenario_breeding = ConstantLayer(0.0) + scenario_breeding = 0.0 try: - scenario_non_breeding = open_layer_as_float64(non_breeding_scenario_path) + scenario_non_breeding: yg.YirgacheffeLayer | float + scenario_non_breeding, _ = open_layer(non_breeding_scenario_path) except FileNotFoundError: # If there is a current but now scenario file it's because the species went extinct under the scenario - scenario_non_breeding = ConstantLayer(0.0) - - layers = [current_breeding, current_non_breeding, scenario_breeding, 
scenario_non_breeding] - union = RasterLayer.find_union(layers) - for layer in layers: - try: - layer.set_window_for_union(union) - except ValueError: - pass + scenario_non_breeding = 0.0 - current_aoh_breeding = current_breeding.sum() persistence_breeding = calc_persistence_value( current_aoh_breeding, historic_aoh_breeding, - z_exponent_func_float + exponent, ) - current_aoh_non_breeding = current_non_breeding.sum() persistence_non_breeding = calc_persistence_value( current_aoh_non_breeding, historic_aoh_non_breeding, - z_exponent_func_float + exponent, ) old_persistence = (persistence_breeding ** 0.5) * (persistence_non_breeding ** 0.5) + # In general Yirgacheffe can infer the behaviour needed for area intersections based on + # operator, but in this instance we want to force the calculation to take place for the + # union of the areas involved. + src_layers = [current_breeding, scenario_breeding, current_non_breeding, scenario_non_breeding] + layers = [x for x in src_layers if isinstance(x, yg.YirgacheffeLayer)] + union = yg.layers.RasterLayer.find_union(layers) + for layer in layers: + layer.set_window_for_union(union) + new_p_breeding = process_delta_p( current_breeding, scenario_breeding, current_aoh_breeding, historic_aoh_breeding, - z_exponent_func_raster + exponent, ) new_p_non_breeding = process_delta_p( current_non_breeding, scenario_non_breeding, current_aoh_non_breeding, historic_aoh_non_breeding, - z_exponent_func_raster + exponent, ) new_p_layer = (new_p_breeding ** 0.5) * (new_p_non_breeding ** 0.5) - delta_p_layer = new_p_layer - ConstantLayer(old_persistence) - - with TemporaryDirectory() as tmpdir: - tmpfile = os.path.join(tmpdir, nonbreeding_filename) - with RasterLayer.empty_raster_layer_like(new_p_breeding, filename=tmpfile) as output: - delta_p_layer.save(output) - shutil.move(tmpfile, os.path.join(output_folder, nonbreeding_filename)) + delta_p_layer = new_p_layer - old_persistence - case Season.BREEDING: - pass # covered by the nonbreeding 
case + try: + delta_p_layer.to_geotiff(output_path) + except ValueError: + print(f"Failed to align layers for {taxid}_{season}", file=sys.stderr) + sentinel_path.touch() + return + + sentinel_path.touch() + case "BREEDING": + # covered by the nonbreeding case + sentinel_path.touch() case _: + sentinel_path.touch() sys.exit(f"Unexpected season for species {taxid}: {season}") +def exponent_type(value: str): + if value == "gompertz": + return value + try: + f = float(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid exponent: {value!r}") from exc + if f not in FLOAT_EXPONENTS: + raise argparse.ArgumentTypeError(f"numeric exponent must be one of {sorted(FLOAT_EXPONENTS)}, got {f}") + return f + +@snakemake_compatible(mapping={ + "taxid": "params.taxon_id", + "season": "params.season", + "current_path": "params.current_path", + "historic_path": "params.pnv_path", + "scenario_path": "params.scenario_path", + "output_path": "params.output_tif", + "exponent": "params.curve", +}) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( - '--speciesdata', + '--taxid', type=str, - help="Single species/seasonality geojson", required=True, - dest="species_data_path" + dest='taxid', + help="Species taxon id", ) parser.add_argument( - '--current_path', + '--season', type=str, required=True, + dest='season', + help="Seasonality", + ) + parser.add_argument( + '--current_path', + type=Path, + required=True, dest="current_path", help="path to species current AOH hex" ) parser.add_argument( '--scenario_path', - type=str, + type=Path, required=True, dest="scenario_path", help="path to species scenario AOH hex" ) parser.add_argument( '--historic_path', - type=str, - required=False, + type=Path, + required=True, dest="historic_path", help="path to species historic AOH hex" ) parser.add_argument('--output_path', - type=str, + type=Path, required=True, dest="output_path", - help="path to save output csv" + help="path to save output tif" + ) 
+ parser.add_argument( + '--z', + dest='exponent', + type=exponent_type, + default=0.25 ) - parser.add_argument('--z', dest='exponent', type=str, default='0.25') args = parser.parse_args() global_code_residents_pixel_ae( - args.species_data_path, + args.taxid, + args.season, args.current_path, args.scenario_path, args.historic_path, diff --git a/local/merge_global_habitat.py b/local/merge_global_habitat.py index 73f247e..aa4d660 100644 --- a/local/merge_global_habitat.py +++ b/local/merge_global_habitat.py @@ -1,32 +1,35 @@ +""" +This script is used to merge localised raster data into a global raster map. + +Specifically it was used originally to merge crosswalked Brazil Mapbiomass data +into the Jung habitat map. +""" + import argparse +from contextlib import nullcontext from pathlib import Path -from yirgacheffe.layers import RasterLayer, RescaledRasterLayer -import yirgacheffe.operators as yo - -from osgeo import gdal -gdal.SetCacheMax(32 * 1024 * 1024) +import yirgacheffe as yg +from alive_progress import alive_bar # type: ignore def merge_global_habitat( global_layer_path: Path, local_layer_path: Path, output_layer_path: Path, + show_progress: bool, ) -> None: # Note, we assume naively the local data is higher resolution than the global layer for now - with RasterLayer.layer_from_file(local_layer_path) as local_layer: - with RescaledRasterLayer.layer_from_file( - global_layer_path, - pixel_scale=local_layer.pixel_scale - ) as global_layer: - local_layer.set_window_for_union(global_layer.area) - cleared = local_layer.nan_to_num() - combined = yo.where(cleared != 0, local_layer, global_layer) - with RasterLayer.empty_raster_layer_like( - combined, - filename=output_layer_path, - datatype=local_layer.datatype - ) as result: - combined.parallel_save(result, parallelism=200) + # In a better world we'd work out which is higher res and make everything in that pixel scale + with ( + yg.read_raster(local_layer_path) as local_layer, + 
yg.read_raster_like(global_layer_path, local_layer, yg.ResamplingMethod.Nearest) as global_layer, + ): + local_layer.set_window_for_union(global_layer.area) + cleared = local_layer.nan_to_num() + combined = yg.where(cleared != 0, local_layer, global_layer) + ctx = alive_bar(manual=True) if show_progress else nullcontext() + with ctx as bar: + combined.to_geotiff(output_layer_path, callback=bar, parallelism=True) def main() -> None: parser = argparse.ArgumentParser() @@ -51,12 +54,21 @@ def main() -> None: dest="output_layer_path", help="Result combined raster path", ) + parser.add_argument( + '-p', + help="Show progress indicator", + default=False, + required=False, + action='store_true', + dest='show_progress', + ) args = parser.parse_args() merge_global_habitat( args.global_layer_path, args.local_layer_path, args.output_layer_path, + args.show_progress, ) if __name__ == "__main__": diff --git a/prepare_layers/build_gaez_hyde.py b/prepare_layers/build_gaez_hyde.py index de0e632..06f1842 100644 --- a/prepare_layers/build_gaez_hyde.py +++ b/prepare_layers/build_gaez_hyde.py @@ -1,13 +1,9 @@ import argparse -import math import os -import tempfile from pathlib import Path import yirgacheffe as yg -import yirgacheffe.operators as yo - -from make_area_map import make_area_map +from snakemake_argparse_bridge import snakemake_compatible # type: ignore DISAGG_CUTOFF = yg.constant(0.95) @@ -23,35 +19,35 @@ def build_gaez_hyde( assert gaez.map_projection == hyde.map_projection projection = gaez.map_projection - with tempfile.TemporaryDirectory() as tmpdir: - area_raster = Path(tmpdir) / "area.tif" - make_area_map(math.fabs(projection.ystep), area_raster) - - with yg.read_narrow_raster(area_raster) as area: - - portional_hyde = (hyde.nan_to_num() * 1000000) / area - portional_gaez = gaez / 100.0 + with yg.area_raster(projection) as area: + portional_hyde = (hyde.nan_to_num() * 1000000) / area + portional_gaez = gaez / 100.0 - # where gaez and hyde disagree (sum greater than 
disagg cutoff), scale down - uncapped_total = portional_gaez + portional_hyde - # NaNs stop warnings about divide by zero - uncapped_total_with_nan = yo.where(uncapped_total == 0.0, float("nan"), uncapped_total) + # where gaez and hyde disagree (sum greater than disagg cutoff), scale down + uncapped_total = portional_gaez + portional_hyde + # NaNs stop warnings about divide by zero + uncapped_total_with_nan = yg.where(uncapped_total == 0.0, float("nan"), uncapped_total) - # calculate ag-perc scalars - total = yo.where( - uncapped_total_with_nan >= DISAGG_CUTOFF, - DISAGG_CUTOFF - (yg.constant(1) / yo.exp(uncapped_total_with_nan * 2)), - uncapped_total_with_nan, - ) + # calculate ag-perc scalars + total = yg.where( + uncapped_total_with_nan >= DISAGG_CUTOFF, + DISAGG_CUTOFF - (yg.constant(1) / yg.exp(uncapped_total_with_nan * 2)), + uncapped_total_with_nan, + ) - gaez_ratio = portional_gaez / uncapped_total_with_nan - gaez_values = total * gaez_ratio - gaez_values.to_geotiff(output_dir_path / "crop.tif") + gaez_ratio = portional_gaez / uncapped_total_with_nan + gaez_values = total * gaez_ratio + gaez_values.to_geotiff(output_dir_path / "crop.tif") - hyde_ratio = portional_hyde / uncapped_total_with_nan - hyde_values = total * hyde_ratio - hyde_values.to_geotiff(output_dir_path / "pasture.tif") + hyde_ratio = portional_hyde / uncapped_total_with_nan + hyde_values = total * hyde_ratio + hyde_values.to_geotiff(output_dir_path / "pasture.tif") +@snakemake_compatible(mapping={ + "gaez_path": "input.gaez_raster", + "hyde_path": "input.hyde_raster", + "output_dir_path": "params.output_dir", +}) def main() -> None: parser = argparse.ArgumentParser(description="Generate a combined GAEZ and Hyde layers") parser.add_argument( diff --git a/prepare_layers/generate_crosswalk.py b/prepare_layers/generate_crosswalk.py index e9fcfd9..9e909e5 100644 --- a/prepare_layers/generate_crosswalk.py +++ b/prepare_layers/generate_crosswalk.py @@ -1,8 +1,10 @@ import argparse import os +from 
pathlib import Path import pandas as pd from iucn_modlib.translator import toJung +from snakemake_argparse_bridge import snakemake_compatible # Take from https://www.iucnredlist.org/resources/habitat-classification-scheme @@ -31,11 +33,9 @@ ] def generate_crosswalk( - output_filename: str, + output_filename: Path, ) -> None: - output_dir, _ = os.path.split(output_filename) - if output_dir: - os.makedirs(output_dir, exist_ok=True) + os.makedirs(output_filename.parent, exist_ok=True) res = [] for iucn_code in IUCN_HABITAT_CODES: @@ -48,11 +48,14 @@ def generate_crosswalk( df = pd.DataFrame(res, columns=["code", "value"]) df.to_csv(output_filename, index=False) +@snakemake_compatible(mapping={ + "output_filename": "output.crosswalk", +}) def main() -> None: parser = argparse.ArgumentParser(description="Generate a Jung crosswalk table as CSV.") parser.add_argument( '--output', - type=str, + type=Path, help='Path where final crosswalk should be stored', required=True, dest='output_filename', diff --git a/prepare_layers/make_arable_map.py b/prepare_layers/make_arable_map.py index ab7ca93..20aa6da 100644 --- a/prepare_layers/make_arable_map.py +++ b/prepare_layers/make_arable_map.py @@ -1,46 +1,44 @@ import argparse +import os +import shutil from pathlib import Path from typing import Optional -import yirgacheffe.operators as yo +import yirgacheffe as yg from alive_progress import alive_bar -from yirgacheffe.layers import RasterLayer - -from osgeo import gdal -gdal.SetCacheMax(1 * 1024 * 1024 * 1024) JUNG_ARABLE_CODE = 1401 JUNG_URBAN_CODE = 1405 def make_arable_map( - current_path: Path, + current_dir_path: Path, output_path: Path, concurrency: Optional[int], show_progress: bool, ) -> None: - with RasterLayer.layer_from_file(current_path) as current: + os.makedirs(output_path, exist_ok=True) - arable_map = yo.where(current != JUNG_URBAN_CODE, JUNG_ARABLE_CODE, JUNG_URBAN_CODE) + # In this scenario all land that isn't urban is covered to arable + urban_filename = 
current_dir_path / f"lcc_{JUNG_URBAN_CODE}.tif" + new_arable_filename = output_path / f"lcc_{JUNG_ARABLE_CODE}.tif" - with RasterLayer.empty_raster_layer_like( - arable_map, - filename=output_path, - threads=16 - ) as result: - if show_progress: - with alive_bar(manual=True) as bar: - arable_map.parallel_save(result, callback=bar, parallelism=concurrency) - else: - arable_map.parallel_save(result, parallelism=concurrency) + shutil.copy(urban_filename, output_path) + with yg.read_raster(urban_filename) as urban: + new_arable = 1.0 - urban + if show_progress: + with alive_bar(manual=True) as bar: + new_arable.to_geotiff(new_arable_filename, callback=bar, parallelism=concurrency) + else: + new_arable.to_geotiff(new_arable_filename, parallelism=concurrency) def main() -> None: parser = argparse.ArgumentParser(description="Generate the arable scenario map.") parser.add_argument( '--current', type=Path, - help='Path of current map', + help='Path of fractional current maps', required=True, - dest='current_path', + dest='current_dir_path', ) parser.add_argument( '--output', @@ -68,7 +66,7 @@ def main() -> None: args = parser.parse_args() make_arable_map( - args.current_path, + args.current_dir_path, args.results_path, args.concurrency, args.show_progress, diff --git a/prepare_layers/make_area_map.py b/prepare_layers/make_area_map.py deleted file mode 100644 index 3a2ca29..0000000 --- a/prepare_layers/make_area_map.py +++ /dev/null @@ -1,91 +0,0 @@ -import argparse -import math -from pathlib import Path - -import numpy as np -from yirgacheffe.layers import RasterLayer # type: ignore -from yirgacheffe.operators import DataType # type: ignore -from yirgacheffe.window import Area, PixelScale # type: ignore - -# Taken from https://gis.stackexchange.com/questions/127165/more-accurate-way-to-calculate-area-of-rasters -def area_of_pixel(pixel_size: float, center_lat: float) -> float: - """Calculate m^2 area of a wgs84 square pixel. 
- - Adapted from: https://gis.stackexchange.com/a/127327/2397 - - Parameters: - pixel_size (float): length of side of pixel in degrees. - center_lat (float): latitude of the center of the pixel. Note this - value +/- half the `pixel-size` must not exceed 90/-90 degrees - latitude or an invalid area will be calculated. - - Returns: - Area of square pixel of side length `pixel_size` centered at - `center_lat` in m^2. - - """ - a = 6378137 # meters - b = 6356752.3142 # meters - e = math.sqrt(1 - (b/a)**2) - area_list = [] - for f in [center_lat+pixel_size/2, center_lat-pixel_size/2]: - zm = 1 - e*math.sin(math.radians(f)) - zp = 1 + e*math.sin(math.radians(f)) - area_list.append( - math.pi * b**2 * ( - math.log(zp/zm) / (2*e) + - math.sin(math.radians(f)) / (zp*zm))) - return pixel_size / 360. * (area_list[0] - area_list[1]) - -def make_area_map( - pixel_scale: float, - output_path: Path, -) -> None: - pixels = [0.0,] * math.floor(90.0 / pixel_scale) - for i in range(len(pixels)): # pylint: disable=C0200 - y = (i + 0.5) * pixel_scale - pixels[i] = area_of_pixel(pixel_scale, y) - - allpixels = np.rot90(np.array([list(reversed(pixels)) + pixels])) - - area = Area( - left=0, - right=pixel_scale, - top=(math.floor(90 / pixel_scale) * pixel_scale), - bottom=(math.floor(90 / pixel_scale) * pixel_scale * -1.0) - ) - with RasterLayer.empty_raster_layer( - area, - PixelScale(pixel_scale, pixel_scale * -1.0), - DataType.Float32, - filename=output_path - ) as res: - assert res.window.xsize == 1 - res._dataset.WriteArray(allpixels, 0, 0) # pylint: disable=W0212 - - -def main() -> None: - parser = argparse.ArgumentParser(description="Downsample habitat map to raster per terrain type.") - parser.add_argument( - "--scale", - type=float, - required=True, - dest="pixel_scale", - help="Output pixel scale value." - ) - parser.add_argument( - "--output", - type=str, - required=True, - dest="output_path", - help="Destination file for area raster." 
- ) - args = parser.parse_args() - - make_area_map( - args.pixel_scale, - args.output_path - ) - -if __name__ == "__main__": - main() diff --git a/prepare_layers/make_current_map.py b/prepare_layers/make_current_map.py index 519e0b4..7be9547 100644 --- a/prepare_layers/make_current_map.py +++ b/prepare_layers/make_current_map.py @@ -1,17 +1,24 @@ import argparse import itertools +import logging +import math +import os +from contextlib import nullcontext from pathlib import Path from multiprocessing import set_start_method -from typing import Dict, List, Optional import pandas as pd -import yirgacheffe.operators as yo # type: ignore +import yirgacheffe as yg from alive_progress import alive_bar # type: ignore -from yirgacheffe.layers import RasterLayer # type: ignore +from snakemake_argparse_bridge import snakemake_compatible # type: ignore from osgeo import gdal # type: ignore gdal.SetCacheMax(1 * 1024 * 1024 * 1024) +logger = logging.getLogger(__name__) +logging.basicConfig() +logger.setLevel(logging.INFO) + # From Eyres et al: The current layer maps IUCN level 1 and 2 habitats, but habitats in the PNV layer are mapped # only at IUCN level 1, so to estimate species’ proportion of original AOH now remaining we could only use natural # habitats mapped at level 1 and artificial habitats at level 2. 
@@ -19,9 +26,9 @@ "14", "14.1", "14.2", "14.3", "14.4", "14.5", "14.6" ] -def load_crosswalk_table(table_file_name: Path) -> Dict[str,List[int]]: +def load_crosswalk_table(table_file_name: Path) -> dict[str,list[int]]: rawdata = pd.read_csv(table_file_name) - result: Dict[str,List[int]] = {} + result: dict[str,list[int]] = {} for _, row in rawdata.iterrows(): try: result[row.code].append(int(row.value)) @@ -29,49 +36,77 @@ def load_crosswalk_table(table_file_name: Path) -> Dict[str,List[int]]: result[row.code] = [int(row.value)] return result - -def make_current_map( +def make_current_maps( jung_path: Path, - update_masks_path: Optional[Path], + update_masks_path: Path | None, crosswalk_path: Path, - output_path: Path, - concurrency: Optional[int], + output_dir_path: Path, + parallelism: int | None, show_progress: bool, + sentinel_path: Path | None, ) -> None: + os.makedirs(output_dir_path, exist_ok=True) + if parallelism: + logger.info("Using %d workers", parallelism) + else: + logger.info("No parallelism specified") if update_masks_path is not None: update_masks = [ - RasterLayer.layer_from_file(x) for x in sorted(list(update_masks_path.glob("*.tif"))) + yg.read_raster(x) for x in sorted(list(update_masks_path.glob("*.tif"))) ] else: update_masks = [] - with RasterLayer.layer_from_file(jung_path) as jung: + with yg.read_raster(jung_path) as jung: crosswalk = load_crosswalk_table(crosswalk_path) map_preserve_code = list(itertools.chain.from_iterable([crosswalk[x] for x in IUCN_CODE_ARTIFICAL])) updated_jung = jung for update in update_masks: - updated_jung = yo.where(update != 0, update, updated_jung) + updated_jung = yg.where(update.isnan(), updated_jung, update) - current_map = yo.where( + current_map = yg.where( updated_jung.isin(map_preserve_code), updated_jung, - (yo.floor(updated_jung / 100) * 100).astype(yo.DataType.UInt16), + (yg.floor(updated_jung / 100) * 100), ) - with RasterLayer.empty_raster_layer_like( - jung, - filename=output_path, - threads=16 - ) 
as result: - if show_progress: - with alive_bar(manual=True) as bar: - current_map.parallel_save(result, callback=bar, parallelism=concurrency) - else: - current_map.parallel_save(result, parallelism=concurrency) - + logger.info("Calculating unique land cover types...") + vals = current_map.unique() + logger.info("Found %s land cover classes", set(vals)) + + for lcc in vals: + # Seems there are some NaN values in Jung + if math.isnan(lcc): + continue + logger.info("Processing %s...", lcc) + per_class = current_map == lcc + cast_per_class = per_class.astype(yg.DataType.Float32) + ctx = alive_bar(manual=True, title=str(lcc)) if show_progress else nullcontext() + with ctx as bar: + cast_per_class.to_geotiff( + output_dir_path / f"lcc_{int(lcc)}.tif", + callback=bar, + parallelism=parallelism, + ) + + # This script generates a bunch of rasters, but snakemake needs one + # output to say when this is done, so if we're in snakemake mode we touch a sentinel file to + # let it know we've done. One day this should be another decorator. 
+ if sentinel_path is not None: + os.makedirs(sentinel_path.parent, exist_ok=True) + sentinel_path.touch() + +@snakemake_compatible(mapping={ + "jung_path": "input.habitat", + "update_masks_path": "params.updates_dir", + "crosswalk_path": "input.crosswalk", + "parallelism": "threads", + "output_dir_path": "params.output_dir", + "sentinel_path": "output.sentinel", +}) def main() -> None: set_start_method("spawn") @@ -102,15 +137,15 @@ def main() -> None: type=Path, help='Path where final map should be stored', required=True, - dest='results_path', + dest='output_dir_path', ) parser.add_argument( '-j', type=int, - help='Number of concurrent threads to use for calculation.', + help='Number of parallel threads to use for calculation.', required=False, default=None, - dest='concurrency', + dest='parallelism', ) parser.add_argument( '-p', @@ -120,15 +155,24 @@ def main() -> None: action='store_true', dest='show_progress', ) + parser.add_argument( + '--sentinel', + type=Path, + help='Generate a sentinel file on completion for snakemake to track', + required=False, + default=None, + dest='sentinel_path', + ) args = parser.parse_args() - make_current_map( + make_current_maps( args.jung_path, args.update_masks_path, args.crosswalk_path, - args.results_path, - args.concurrency, + args.output_dir_path, + args.parallelism, args.show_progress, + args.sentinel_path, ) if __name__ == "__main__": diff --git a/prepare_layers/make_diff_map.py b/prepare_layers/make_diff_map.py index 6624880..e9fe7c6 100644 --- a/prepare_layers/make_diff_map.py +++ b/prepare_layers/make_diff_map.py @@ -1,131 +1,72 @@ import argparse -import shutil -import tempfile +from contextlib import nullcontext from pathlib import Path -from typing import Optional -from alive_progress import alive_bar -from osgeo import gdal -from yirgacheffe.layers import RasterLayer, UniformAreaLayer -from yirgacheffe.operators import DataType +import yirgacheffe as yg +from alive_progress import alive_bar # type: ignore +from 
snakemake_argparse_bridge import snakemake_compatible # type: ignore -gdal.SetCacheMax(512 * 1024 * 1024) +POSSIBLE_HABITAT_CLASSES = [100, 200, 300, 400, 500, 600, 700, 800, 900, + 1000, 1100, 1200, 1300, 1400, 1401, 1402, 1403, 1404, + 1405, 1406, 1500, 1600, 1800] def make_diff_map( current_path: Path, scenario_path: Path, - area_path: Path, - pixel_scale: float, - target_projection: Optional[str], output_path: Path, - concurrency: Optional[int], + parallelism: None | int, show_progress: bool, ) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir_path = Path(tmpdir) - raw_map_filename = tmpdir_path / "raw.tif" - print("comparing:") - with RasterLayer.layer_from_file(current_path) as current: - with RasterLayer.layer_from_file(scenario_path) as scenario: + layers = [] + for habitat in POSSIBLE_HABITAT_CLASSES: + current_habitat_filename = current_path / f"lcc_{habitat}.tif" + scenario_habitat_filename = scenario_path / f"lcc_{habitat}.tif" - diff_map = current != scenario + if not current_habitat_filename.exists() and not scenario_habitat_filename.exists(): + continue + current_layer = yg.read_raster(current_habitat_filename) if current_habitat_filename.exists() \ + else yg.constant(0.0) + scenario_layer = yg.read_raster(scenario_habitat_filename) if scenario_habitat_filename.exists() \ + else yg.constant(0.0) + habitat_diff = current_layer != scenario_layer + layers.append(habitat_diff) - gdal.SetCacheMax(512 * 1024 * 1024) - with RasterLayer.empty_raster_layer_like( - diff_map, - filename=raw_map_filename, - datatype=DataType.Float32, - threads=16 - ) as result: - if show_progress: - with alive_bar(manual=True) as bar: - diff_map.parallel_save(result, callback=bar, parallelism=concurrency) - else: - diff_map.parallel_save(result, parallelism=concurrency) - - gdal.SetCacheMax(256 * 1024 * 1024 * 1024) - rescaled_map_filename = tmpdir_path / "rescaled.tif" - print("reprojecting:") - with alive_bar(manual=True) as bar: - 
gdal.Warp(rescaled_map_filename, raw_map_filename, options=gdal.WarpOptions( - creationOptions=['COMPRESS=LZW', 'NUM_THREADS=16'], - multithread=True, - dstSRS=target_projection, - outputType=gdal.GDT_Float32, - xRes=pixel_scale, - yRes=0.0 - pixel_scale, - resampleAlg="average", - workingType=gdal.GDT_Float32, - callback=lambda a, _b, _c: bar(a), # pylint: disable=E1102 - )) - - print("scaling result:") - with UniformAreaLayer.layer_from_file(area_path) as area_map: - with RasterLayer.layer_from_file(rescaled_map_filename) as diff_map: - - area_adjusted_map_filename = tmpdir_path / "final.tif" - final = area_map * diff_map - gdal.SetCacheMax(512 * 1024 * 1024) - - with RasterLayer.empty_raster_layer_like( - final, - filename=area_adjusted_map_filename, - datatype=gdal.GDT_Float32, - threads=16 - ) as result: - if show_progress: - with alive_bar(manual=True) as bar: - final.parallel_save(result, callback=bar, parallelism=concurrency) - else: - final.parallel_save(result, parallelism=concurrency) - - shutil.move(area_adjusted_map_filename, output_path) + diff = yg.any(layers) + area = yg.area_raster(diff.map_projection) + scaled_diff = diff * area + ctx = alive_bar(manual=True) if show_progress else nullcontext() + with ctx as bar: + scaled_diff.to_geotiff(output_path, callback=bar, parallelism=parallelism) +@snakemake_compatible(mapping={ + "current_path": "input.current", + "scenario_path": "params.scenario", + "parallelism": "threads", + "output_path": "output[0]", +}) def main() -> None: parser = argparse.ArgumentParser(description="Generate an area difference map.") parser.add_argument( '--current', type=Path, - help='Path of current map', + help='Path of current fractional maps', required=True, dest='current_path', ) parser.add_argument( '--scenario', type=Path, - help='Path of the scenario map', + help='Path of the scenario fractional maps', required=True, dest='scenario_path', ) - parser.add_argument( - '--area', - type=Path, - help='Path of the area per 
pixel map', - required=True, - dest='area_path', - ) - parser.add_argument( - "--scale", - type=float, - required=True, - dest="pixel_scale", - help="Output pixel scale value." - ) - parser.add_argument( - '--projection', - type=str, - help="Target projection", - required=False, - dest="target_projection", - default=None - ) parser.add_argument( '--output', type=Path, help='Path where final map should be stored', required=True, - dest='results_path', + dest='output_path', ) parser.add_argument( '-j', @@ -133,7 +74,7 @@ def main() -> None: help='Number of concurrent threads to use for calculation.', required=False, default=None, - dest='concurrency', + dest='parallelism', ) parser.add_argument( '-p', @@ -148,11 +89,8 @@ def main() -> None: make_diff_map( args.current_path, args.scenario_path, - args.area_path, - args.pixel_scale, - args.target_projection, - args.results_path, - args.concurrency, + args.output_path, + args.parallelism, args.show_progress, ) diff --git a/prepare_layers/make_food_current_map.py b/prepare_layers/make_food_current_map.py index 0b7e876..483476a 100644 --- a/prepare_layers/make_food_current_map.py +++ b/prepare_layers/make_food_current_map.py @@ -1,18 +1,18 @@ import argparse -import math +import multiprocessing import os import resource import sys import time from pathlib import Path -from multiprocessing import Manager, Process, cpu_count +from multiprocessing import Process, cpu_count from queue import Queue from typing import NamedTuple from osgeo import gdal import numpy as np import yirgacheffe as yg -from yirgacheffe.layers import RasterLayer +from snakemake_argparse_bridge import snakemake_compatible # type: ignore gdal.SetCacheMax(4 * 1024 * 1024 * 1024) @@ -22,91 +22,237 @@ # Codes not to touch. 
We're currently working at Level 1 except for artificial which is level 2 PRESERVE_CODES = [600, 700, 900, 1000, 1100, 1200, 1300, 1405] +# PNV codes +# array([ 100, 200, 300, 400, 500, 600, 800, 900, 1100, 1200], dtype=uint16) + class TileInfo(NamedTuple): """Info about a tile to process""" x_position : int y_position : int width : int height : int - crop_diff : float - pasture_diff : float + crop_target : float + pasture_target : float + +def balance_crop_and_pasture_differences( + crop_diff: float, + pasture_diff: float, + lcc_data_map: dict[int,np.ndarray], +) -> tuple[float,float]: + """ + If we remove one type of agricultural land but expand another, keep them in the same area where possible. + """ + # Either both are a reduction or both an increase, or at least one is null, so no + # balancing required. + if crop_diff * pasture_diff >= 0: + return crop_diff, pasture_diff + + # If balanced they will cancel each other out, otherwise + # we will move the smaller difference from one to the other. 
+ transfer_amount = min(abs(crop_diff), abs(pasture_diff)) + + if crop_diff > 0: + # Crop increasing, pasture decreasing + src_lcc, dst_lcc = PASTURE_CODE, CROP_CODE + else: + # Pasture increasing, crop decreasing + src_lcc, dst_lcc = CROP_CODE, PASTURE_CODE + + src_raster = lcc_data_map[src_lcc] + dst_raster = lcc_data_map[dst_lcc] + + total_cells = src_raster.size + + transfer_mask = src_raster > 0 + src_cells = src_raster.sum() + if src_cells == 0: + if crop_diff > 0: + raise ValueError(f"not cells in pasture {src_raster.sum()}, but pasture diff is -ve {pasture_diff}") + raise ValueError(f"not cells in crop {src_raster.sum()}, but crop diff is -ve {crop_diff}") + + src_coverage = src_raster.sum() / total_cells + + # Per-cell reduction factor: what fraction of each cell's current value to move + # This is safe because our simplifying assumption guarantees transfer <= source_coverage + per_cell_factor = transfer_amount / src_coverage + + transferred = transfer_mask * per_cell_factor + src_raster -= transferred + dst_raster += transferred + + new_crop_diff = crop_diff + transfer_amount * np.sign(pasture_diff) + new_pasture_diff = pasture_diff + transfer_amount * np.sign(crop_diff) + return new_crop_diff, new_pasture_diff + +def remove_land_cover( + lcc_code: int, + diff: float, + pnv: np.ndarray, + lcc_data_map: dict[int,np.ndarray], +) -> None: + assert diff <= 0 + diff = abs(diff) -def process_tile( - current: yg.layers.RasterLayer, - pnv: yg.layers.RasterLayer, - tile: TileInfo, - random_seed: int, -) -> np.ndarray: + agri_raster = lcc_data_map[lcc_code] + removal_mask = agri_raster > 0 - rng = np.random.default_rng(random_seed) + current_coverage = agri_raster.sum() / agri_raster.size + if current_coverage == 0: + return - data = current.read_array(tile.x_position, tile.y_position, tile.width, tile.height) + per_cell_fraction = min(diff / current_coverage, 1.0) - diffs = [ - (tile.crop_diff, CROP_CODE), - (tile.pasture_diff, PASTURE_CODE), - ] - 
diffs.sort(key=lambda a: a[0]) + removed_grid = agri_raster * (removal_mask * per_cell_fraction) + agri_raster -= removed_grid - # Ordered so removes will come first - for diff_value, habitat_code in diffs: - if np.isnan(diff_value): - continue - required_points = math.floor(data.shape[0] * data.shape[1] * math.fabs(diff_value)) - if required_points == 0: - continue + # Reallocate to PNV classes - note we assume this does not include the agricultural classes + # so as to not undo what we just did! + for lcc, lcc_data in lcc_data_map.items(): + pnv_match = (pnv == lcc) & removal_mask + lcc_data[pnv_match] += removed_grid[pnv_match] - if diff_value > 0: - valid_mask = ~np.isin(data, [CROP_CODE, PASTURE_CODE] + PRESERVE_CODES) - else: - valid_mask = data == habitat_code +def add_land_cover( + eligible_mask: np.ndarray, + diffs: list[tuple[float, int]], + lcc_data_map: dict[int,np.ndarray], +) -> None: - valid_locations = valid_mask.nonzero() - possible_points = len(valid_locations[0]) - if possible_points == 0: + # Calculate capacity + eligible_count = eligible_mask.sum() + if eligible_count == 0: + return + total_cells = eligible_mask.size + eligible_fraction = eligible_count / total_cells + if eligible_fraction == 0: + return + + total_addition = 0 + for diff, lcc_code in diffs: + assert 0 <= diff <= 1 + + agri_raster = lcc_data_map[lcc_code] + + per_cell_addition = diff / eligible_fraction + per_cell_addition = min(per_cell_addition, 1.0) + total_addition += per_cell_addition + + agri_raster[eligible_mask] += per_cell_addition + + # Remove from the other land cover classes. 
This assumes that coming into this + # stage the LCC pixels are: + # * only non-zero in a single layer + # * only starting at 100% + for lcc, lcc_data in lcc_data_map.items(): + if lcc in [CROP_CODE, PASTURE_CODE] + PRESERVE_CODES: continue - required_points = min(required_points, possible_points) + lcc_data[eligible_mask & (lcc_data > 0)] -= total_addition + lcc_data[eligible_mask] = np.maximum(lcc_data[eligible_mask], 0.0) + +def process_tile( + current_maps: dict[int,yg.YirgacheffeLayer], + pnv: yg.YirgacheffeLayer, + tile: TileInfo, +) -> dict[int,np.ndarray]: + lcc_data_map = { + lcc: current_map.read_array(tile.x_position, tile.y_position, tile.width, tile.height) + for lcc, current_map in current_maps.items() + } + + if np.isnan(tile.crop_target) and np.isnan(tile.pasture_target): + return lcc_data_map + + for current in current_maps.values(): + assert current.map_projection == pnv.map_projection + assert current.area == pnv.area + + if not np.isnan(tile.crop_target): + crop_data = lcc_data_map[CROP_CODE] + crop_diff = tile.crop_target - (crop_data.sum() / crop_data.size) + assert not np.isnan(crop_diff) + else: + crop_diff = 0 + if not np.isnan(tile.pasture_target): + pasture_data = lcc_data_map[PASTURE_CODE] + pasture_diff = tile.pasture_target - (pasture_data.sum() / pasture_data.size) + assert not np.isnan(pasture_diff) + else: + pasture_diff = 0 + if (crop_diff == 0) and (pasture_diff == 0): + return lcc_data_map + + crop_diff, pasture_diff = balance_crop_and_pasture_differences( + crop_diff, + pasture_diff, + lcc_data_map, + ) + + # We first do all the removals and then the additions + diffs = [ + (crop_diff, CROP_CODE), + (pasture_diff, PASTURE_CODE), + ] + removals = [x for x in diffs if x[0] < 0] + additions = [x for x in diffs if x[0] > 0] + + pnv_data = None # lazy PNV load as it's expensive + for diff_value, habitat_code in removals: + if pnv_data is None: + pnv_data = pnv.read_array(tile.x_position, tile.y_position, tile.width, tile.height) + 
remove_land_cover(habitat_code, diff_value, pnv_data, lcc_data_map) + + # If there's no additions we don't need to make the eligible_mask, and we can go + # home early. + if not additions: + return lcc_data_map + + # Find areas we can put the new data. This is anywhere we don't already + # have agricultural land, and other places unlikely to be converted (cities, lakes, etc.) + # We know that there should be no partial cells involving crop/pasture at this stage + # because of the balancing we did initially. + excluded_codes = [CROP_CODE, PASTURE_CODE] + PRESERVE_CODES + eligible_mask = np.ones_like(lcc_data_map[CROP_CODE], dtype=bool) + for excluded_code in excluded_codes: + if excluded_code in lcc_data_map: + eligible_mask &= (lcc_data_map[excluded_code] == 0) + + # There is a risk that the total is not achievable as there is a disagreement between the combined + # GAEZ/HYDE, Jung, and our PRESERVE_CODES list (that say don't covert urban or rocky land to farmland). + # As such we need to adjust for what is actually achievable before we make changes. 
+ available_cells = eligible_mask.sum() / eligible_mask.size + total_desired_change = sum(x[0] for x in additions) + if total_desired_change > available_cells: + # Split what is actually possible proportionally to the original demand + additions = [ + ((change / total_desired_change) * available_cells, klass) + for (change, klass) in additions + ] + + add_land_cover(eligible_mask, additions, lcc_data_map) + + return lcc_data_map - selected_locations = rng.choice( - len(valid_locations[0]), - size=required_points, - replace=False - ) - rows = valid_locations[0][selected_locations] - cols = valid_locations[1][selected_locations] - if diff_value > 0: - data[rows, cols] = habitat_code - else: - for y, x in zip(rows, cols): - absolute_x = x + tile.x_position - absolute_y = y + tile.y_position - lat, lng = current.latlng_for_pixel(absolute_x, absolute_y) - pnv_x, pnv_y = pnv.pixel_for_latlng(lat, lng) - val = pnv.read_array(pnv_x, pnv_y, 1, 1)[0][0] - data[y][x] = val - - return data def process_tile_concurrently( current_lvl1_path: Path, pnv_path: Path, input_queue: Queue, - result_queue: Queue, + result_queues: dict[int,Queue], ) -> None: - with yg.read_raster(current_lvl1_path) as current: - with yg.read_raster(pnv_path) as pnv: - while True: - job : tuple[TileInfo, int] | None = input_queue.get() - if job is None: - break - tile, seed = job - if np.isnan(tile.crop_diff) and np.isnan(tile.pasture_diff): - result_queue.put((tile, None)) - else: - data = process_tile(current, pnv, tile, seed) - result_queue.put((tile, data.tobytes())) - - result_queue.put(None) + current_maps = { + int(filename.stem.split('_')[1]): yg.read_raster(filename) for filename in current_lvl1_path.glob("lcc_*.tif") + } + reference_layer = next(iter(current_maps.values())) + with yg.read_raster(pnv_path) as pnv: + pnv.set_window_for_intersection(reference_layer.area) + while True: + tile : TileInfo | None = input_queue.get() + if tile is None: + break + res = process_tile(current_maps, pnv, 
tile) + for lcc, data in res.items(): + result_queues[lcc].put((tile, data.tobytes())) + for queue in result_queues.values(): + queue.put(None) def build_tile_list( current_lvl1_path: Path, @@ -114,66 +260,71 @@ def build_tile_list( pasture_adjustment_path: Path, ) -> list[TileInfo]: tiles = [] - with yg.read_raster(current_lvl1_path) as current: - current_dimensions = current.window.xsize, current.window.ysize - with yg.read_raster(crop_adjustment_path) as crop_diff: - with yg.read_raster(pasture_adjustment_path) as pasture_diff: - assert crop_diff.window == pasture_diff.window - diff_dimensions = crop_diff.window.xsize, crop_diff.window.ysize - - x_scale = current_dimensions[0] / diff_dimensions[0] - y_scale = current_dimensions[1] / diff_dimensions[1] - - x_steps = [round(i * x_scale) for i in range(diff_dimensions[0])] - x_steps.append(current_dimensions[0]) - y_steps = [round(i * y_scale) for i in range(diff_dimensions[1])] - y_steps.append(current_dimensions[1]) - - for y in range(crop_diff.window.ysize): - crop_row = crop_diff.read_array(0, y, crop_diff.window.xsize, 1) - pasture_row = pasture_diff.read_array(0, y, pasture_diff.window.xsize, 1) - for x in range(crop_diff.window.xsize): - tiles.append(TileInfo( - x_steps[x], - y_steps[y], - (x_steps[x+1] - x_steps[x]), - (y_steps[y+1] - y_steps[y]), - crop_row[0][x], - pasture_row[0][x], - )) + + with yg.read_raster(next(current_lvl1_path.glob("*.tif"))) as example: + current_dimensions = example.window.xsize, example.window.ysize + with ( + yg.read_raster(crop_adjustment_path) as crop, + yg.read_raster(pasture_adjustment_path) as pasture, + ): + assert crop.window == pasture.window + argi_dimensions = crop.window.xsize, crop.window.ysize + + x_scale = current_dimensions[0] / argi_dimensions[0] + y_scale = current_dimensions[1] / argi_dimensions[1] + + x_steps = [round(i * x_scale) for i in range(argi_dimensions[0])] + x_steps.append(current_dimensions[0]) + y_steps = [round(i * y_scale) for i in 
range(argi_dimensions[1])] + y_steps.append(current_dimensions[1]) + + for y in range(crop.window.ysize): + crop_row = crop.read_array(0, y, crop.window.xsize, 1) + pasture_row = pasture.read_array(0, y, pasture.window.xsize, 1) + for x in range(crop.window.xsize): + tiles.append(TileInfo( + x_steps[x], + y_steps[y], + (x_steps[x+1] - x_steps[x]), + (y_steps[y+1] - y_steps[y]), + crop_row[0][x], + pasture_row[0][x], + )) return tiles def assemble_map( + lcc: int, current_lvl1_path: Path, output_path: Path, result_queue: Queue, sentinal_count: int, ) -> None: + os.makedirs(output_path, exist_ok=True) + with yg.read_raster(current_lvl1_path / f"lcc_{lcc}.tif") as current_map: + new_map = yg.layers.RasterLayer.empty_raster_layer_like( + current_map, + filename=output_path / f"lcc_{lcc}.tif", + threads=16, + ) + dtype = current_map.read_array(0, 0, 1, 1).dtype + band = new_map._dataset.GetRasterBand(1) # pylint: disable=W0212 + + count = 0 + while True: + result : tuple[TileInfo,bytearray] | None = result_queue.get() + if result is None: + sentinal_count -= 1 + if sentinal_count == 0: + break + continue - with yg.read_raster(current_lvl1_path) as current: - dtype = current.read_array(0, 0, 1, 1).dtype - with RasterLayer.empty_raster_layer_like(current, filename=output_path) as output: - - # A cheat as we don't have a neat API for this on yirgacheffe yet - band = output._dataset.GetRasterBand(1) # pylint: disable=W0212 - - while True: - result : tuple[TileInfo,bytearray | None] | None = result_queue.get() - if result is None: - sentinal_count -= 1 - if sentinal_count == 0: - break - continue - - tile, rawdata = result - if rawdata is None: - data = current.read_array(tile.x_position, tile.y_position, tile.width, tile.height) - else: - n = np.frombuffer(rawdata, dtype=dtype) - data = np.reshape(n, (tile.height, tile.width)) - - band.WriteArray(data, tile.x_position, tile.y_position) - + count += 1 + tile, rawdata = result + n = np.frombuffer(rawdata, dtype=dtype) + data 
= np.reshape(n, (tile.height, tile.width)) + band.WriteArray(data, tile.x_position, tile.y_position) + if count % 1000 == 0: + print(f"{lcc}: assembled {count} tiles") def pipeline_source( current_lvl1_path: Path, @@ -181,88 +332,109 @@ def pipeline_source( pasture_adjustment_path: Path, source_queue: Queue, sentinal_count: int, - random_seed: int, ) -> None: - rng = np.random.default_rng(random_seed) - tiles = build_tile_list( current_lvl1_path, crop_adjustment_path, pasture_adjustment_path, ) - seeds = rng.integers(2**63, size=len(tiles)) - for tile, seed in zip(tiles, seeds): - source_queue.put((tile, seed)) + print(f"There are {len(tiles)} tiles") + for tile in tiles: + source_queue.put(tile) for _ in range(sentinal_count): source_queue.put(None) + +def get_lcc_list(current_lvl1_path: Path) -> list[int]: + rasters = current_lvl1_path.glob("*.tif") + return [int(x.stem.split('_')[1]) for x in rasters] + def make_food_current_map( current_lvl1_path: Path, pnv_path: Path, crop_adjustment_path: Path, pasture_adjustment_path: Path, - random_seed: int, output_path: Path, processes_count: int, + sentinel_path: Path | None, ) -> None: # We'll use a lot of processes which will talk back to the main process, so # we need to adjust the ulimit, which is quite low by default _, max_fd_limit = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (max_fd_limit, max_fd_limit)) - print(f"Set fd limit to {max_fd_limit}") os.makedirs(output_path.parent, exist_ok=True) - with Manager() as manager: - source_queue = manager.Queue() - result_queue = manager.Queue() - - workers = [Process(target=process_tile_concurrently, args=( - current_lvl1_path, - pnv_path, - source_queue, - result_queue, - )) for _ in range(processes_count)] - for worker_process in workers: - worker_process.start() - - source_worker = Process(target=pipeline_source, args=( - current_lvl1_path, - crop_adjustment_path, - pasture_adjustment_path, - source_queue, - processes_count, - 
random_seed, - )) - source_worker.start() + lcc_list = get_lcc_list(current_lvl1_path) + result_queues: dict[int,multiprocessing.queues.Queue] = { + lcc: multiprocessing.Queue(maxsize=10) for lcc in lcc_list + } - assemble_map( + assembly_processes = [ + Process(target=assemble_map, args=( + lcc, current_lvl1_path, output_path, - result_queue, + queue, processes_count, - ) + )) for lcc, queue in result_queues.items() + ] + for assembly_worker in assembly_processes: + assembly_worker.start() - processes = workers - processes.append(source_worker) - while processes: - candidates = [x for x in processes if not x.is_alive()] - for candidate in candidates: - candidate.join() - if candidate.exitcode: - for victim in processes: - victim.kill() - sys.exit(candidate.exitcode) - processes.remove(candidate) - time.sleep(0.1) + source_queue: multiprocessing.queues.Queue = multiprocessing.Queue(maxsize=1000) + workers = [Process(target=process_tile_concurrently, args=( + current_lvl1_path, + pnv_path, + source_queue, + result_queues, + )) for _ in range(processes_count)] + for worker_process in workers: + worker_process.start() + + source_worker = Process(target=pipeline_source, args=( + current_lvl1_path, + crop_adjustment_path, + pasture_adjustment_path, + source_queue, + processes_count, + )) + source_worker.start() + + processes = workers + assembly_processes + processes.append(source_worker) + while processes: + candidates = [x for x in processes if not x.is_alive()] + for candidate in candidates: + candidate.join() + if candidate.exitcode: + for victim in processes: + victim.kill() + sys.exit(candidate.exitcode) + processes.remove(candidate) + time.sleep(0.1) + + if sentinel_path: + sentinel_path.touch() + +@snakemake_compatible(mapping={ + "current_lvl1_path": "params.jung_dir", + "pnv_path": "input.pnv", + "crop_adjustment_path": "input.crop", + "pasture_adjustment_path": "input.pasture", + "processes_count": "threads", + "output_path": "params.output_dir", + 
"sentinel_path": "output.sentinel", + "parallelism": "threads", +}) def main() -> None: parser = argparse.ArgumentParser(description="Build the food current map") parser.add_argument( "--current_lvl1", type=Path, required=True, - help="Path of lvl1 current map", + help="Path of lvl1 current maps", dest="current_lvl1_path", ) parser.add_argument( @@ -273,26 +445,19 @@ def main() -> None: dest='pnv_path', ) parser.add_argument( - "--crop_diff", + "--crop", type=Path, required=True, - help="Path of adjustment for crop diff", + help="Path of crop area", dest="crop_adjustment_path", ) parser.add_argument( - "--pasture_diff", + "--pasture", type=Path, required=True, - help="Path of adjustment for pasture diff", + help="Path of pasture area", dest="pasture_adjustment_path", ) - parser.add_argument( - "--seed", - type=int, - required=True, - help="Seed the random number generator", - dest="seed", - ) parser.add_argument( '--output', type=Path, @@ -300,12 +465,18 @@ def main() -> None: required=True, dest='output_path', ) + parser.add_argument( + '--sentinel', + type=Path, + required=False, + dest='sentinel_path', + ) parser.add_argument( "-j", type=int, required=False, - default=round(cpu_count() / 1), - dest="processes_count", + default=cpu_count() // 2, + dest="parallelism", help="Number of concurrent threads to use." 
) args = parser.parse_args() @@ -315,9 +486,9 @@ def main() -> None: args.pnv_path, args.crop_adjustment_path, args.pasture_adjustment_path, - args.seed, args.output_path, - args.processes_count, + args.parallelism, + args.sentinel_path, ) if __name__ == "__main__": diff --git a/prepare_layers/make_restore_map.py b/prepare_layers/make_restore_map.py index cb8826e..a273b5f 100644 --- a/prepare_layers/make_restore_map.py +++ b/prepare_layers/make_restore_map.py @@ -1,15 +1,13 @@ import argparse import itertools +import os +from contextlib import ExitStack, nullcontext from pathlib import Path from typing import Dict, List, Optional import pandas as pd -import yirgacheffe.operators as yo +import yirgacheffe as yg from alive_progress import alive_bar -from yirgacheffe.layers import RasterLayer, RescaledRasterLayer - -from osgeo import gdal -gdal.SetCacheMax(1 * 1024 * 1024 * 1024) # From Eyres et al: In the restoration scenario all areas classified as arable or pasture were restored to their PNV IUCN_CODE_REPLACEMENTS = [ @@ -30,33 +28,46 @@ def load_crosswalk_table(table_file_name: Path) -> Dict[str,List[int]]: result[row.code] = [int(row.value)] return result - def make_restore_map( pnv_path: Path, - current_path: Path, + current_dir_path: Path, crosswalk_path: Path, output_path: Path, - concurrency: Optional[int], + parallelism: Optional[int], show_progress: bool, ) -> None: - with RasterLayer.layer_from_file(current_path) as current: - with RescaledRasterLayer.layer_from_file(pnv_path, current.pixel_scale) as pnv: - crosswalk = load_crosswalk_table(crosswalk_path) + os.makedirs(output_path, exist_ok=True) + + crosswalk = load_crosswalk_table(crosswalk_path) + + map_replacement_codes = list(itertools.chain.from_iterable([crosswalk[x] for x in IUCN_CODE_REPLACEMENTS])) + ideal_map_replacement_filenames = [current_dir_path / f"lcc_{code}.tif" for code in map_replacement_codes] + map_replacement_filenames = [path for path in ideal_map_replacement_filenames if 
path.exists()] + + with ExitStack() as stack: + replacement_maps = [stack.enter_context(yg.read_raster(filename)) for filename in map_replacement_filenames] + replacement_total = yg.sum(replacement_maps) - map_replacement_codes = list(itertools.chain.from_iterable([crosswalk[x] for x in IUCN_CODE_REPLACEMENTS])) - restore_map = yo.where(current.isin(map_replacement_codes), pnv, current) + # all the ones we expect to be left with, but not the ones we're removing + current_raster_filenames = [ + path for path in current_dir_path.glob("*.tif") if path not in map_replacement_filenames + ] - with RasterLayer.empty_raster_layer_like( - restore_map, - filename=output_path, - threads=16 - ) as result: - if show_progress: - with alive_bar(manual=True) as bar: - restore_map.parallel_save(result, callback=bar, parallelism=concurrency) - else: - restore_map.parallel_save(result, parallelism=concurrency) + # Read the PNV as the same scale as the other maps + with yg.read_raster_like(pnv_path, replacement_maps[0], yg.ResamplingMethod.Nearest) as pnv: + for filename in current_raster_filenames: + lcc_code = int(filename.stem.split('_')[1]) + ctx = alive_bar(manual=True, title=str(lcc_code)) if show_progress else nullcontext() + with ctx as bar: + with yg.read_raster(filename) as layer: + updated_layer = layer + (replacement_total * (pnv == lcc_code).astype(yg.DataType.Float32)) + capped_updated_layer = yg.where(updated_layer > 1, 1.0, updated_layer) + capped_updated_layer.to_geotiff( + output_path / f"lcc_{lcc_code}.tif", + callback=bar, + parallelism=parallelism, + ) def main() -> None: parser = argparse.ArgumentParser(description="Zenodo resource downloader.") @@ -70,9 +81,9 @@ def main() -> None: parser.add_argument( '--current', type=Path, - help='Path of current map', + help='Path of current maps', required=True, - dest='current_path', + dest='current_dir_path', ) parser.add_argument( '--crosswalk', @@ -91,10 +102,10 @@ def main() -> None: parser.add_argument( '-j', 
type=int, - help='Number of concurrent threads to use for calculation.', + help='Number of parallel threads to use for calculation.', required=False, default=None, - dest='concurrency', + dest='parallelism', ) parser.add_argument( '-p', @@ -108,10 +119,10 @@ def main() -> None: make_restore_map( args.pnv_path, - args.current_path, + args.current_dir_path, args.crosswalk_path, args.results_path, - args.concurrency, + args.parallelism, args.show_progress, ) diff --git a/prepare_species/common.py b/prepare_species/common.py index 6869d77..48a28f0 100644 --- a/prepare_species/common.py +++ b/prepare_species/common.py @@ -212,7 +212,7 @@ def tidy_reproject_save( graw = gdf.loc[0].copy() grow = aoh.tidy_data(graw) # type: ignore - output_path = output_directory_path / f"{grow.id_no}_{grow.season}.geojson" + output_path = output_directory_path / f"range_T{grow.id_no}A{grow.assessment_id}_{grow.season}.geojson" res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry") res_projected = res.to_crs(target_crs) report.filename = output_path diff --git a/requirements.txt b/requirements.txt index 0da34ab..20c3fc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,12 +14,18 @@ rasterio requests alive-progress matplotlib -yirgacheffe -aoh -gdal[numpy] +yirgacheffe>=1.12.7,<2.0 +aoh>=2.1.2,<3.0 +gdal[numpy]==3.12.2 git+https://github.com/quantifyearth/iucn_modlib.git git+https://github.com/quantifyearth/seasonality + +# Snakemake workflow management +snakemake>=8.0 +# We require a bug fix on this package that isn't yet upstream +snakemake-argparse-bridge + # developer requirements pytest pylint @@ -28,3 +34,4 @@ pandas-stubs types-requests yirgacheffe[dev] aoh[dev] +snakefmt diff --git a/scripts/generate_food_map.sh b/scripts/generate_food_map.sh deleted file mode 100755 index 9d93a88..0000000 --- a/scripts/generate_food_map.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# -# This script will generate the updated habitat map for the FOOD paper 
based on the original Jung habitat map. -# Note that this is not derived directly from the Jung map, but rather from the simplified version used in LIFE, -# which has all habitats at level 1 except anthropomorphic ones at level 2. As such this script assumes you have -# downloaded and generated `current_raw.tif` from the original LIFE pipeline (see run.sh) - -set -e -set -x - -if [ -z "${DATADIR}" ]; then - echo "Please specify $DATADIR" - exit 1 -fi - -if [ -z "${VIRTUAL_ENV}" ]; then - echo "Please specify run in a virtualenv" - exit 1 -fi - -if [ ! -f "${DATADIR}"/habitat/current_raw.tif ]; then - echo "LIFE Level 1 current map required" - exit 1 -fi - -if [ ! -f "${DATADIR}"/habitat/pnv_raw.tif ]; then - echo "Jung PNV map required" - exit 1 -fi - -if [ ! -d "${DATADIR}"/food ]; then - mkdir -p "${DATADIR}"/food -fi - -# Get GAEZ data -if [ ! -f "${DATADIR}"/food/GLCSv11_02_5m.tif ]; then - if [ ! -f "${DATADIR}"/food/LR.zip ]; then - curl -o "${DATADIR}"/food/LR.zip https://s3.eu-west-1.amazonaws.com/data.gaezdev.aws.fao.org/LR.zip - fi - unzip -j "${DATADIR}"/food/LR.zip LR/lco/GLCSv11_02_5m.tif -d "${DATADIR}"/food/ -fi - -# Get HYDE 3.2 data -if [ ! -f "${DATADIR}"/food/modified_grazing2017AD.asc ]; then - if [ ! -f "${DATADIR}"/food/baseline.zip ]; then - curl -o "${DATADIR}"/food/baseline.zip "https://geo.public.data.uu.nl/vault-hyde/HYDE%203.2%5B1710494848%5D/original/baseline.zip" - fi - if [ ! -f "${DATADIR}"/food/2017AD_lu.zip ]; then - unzip -j "${DATADIR}"/food/baseline.zip baseline/zip/2017AD_lu.zip -d "${DATADIR}"/food/ - fi - if [ ! -f "${DATADIR}"/food/grazing2017AD.asc ]; then - unzip -j "${DATADIR}"/food/2017AD_lu.zip grazing2017AD.asc -d "${DATADIR}"/food/ - fi - # The pixel scale in the two files have different rounding despite covering the same area - # and so this makes them align. - sed "s/0.0833333/0.08333333333333333/" "${DATADIR}"/food/grazing2017AD.asc > "${DATADIR}"/food/modified_grazing2017AD.asc -fi - -if [ ! 
-f "${DATADIR}"/food/modified_grazing2017AD.prj ]; then - echo 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]' > "${DATADIR}"/food/modified_grazing2017AD.prj -fi - -# We need rescaled versions of the current data -if [ ! -d "${DATADIR}"/food/current_layers ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/current_raw.tif \ - --scale 0.08333333333333333 \ - --output "${DATADIR}"/food/current_layers/ -fi - -# Combine GAEZ and HYDE data -if [ ! -f "${DATADIR}"/food/crop.tif ] || [ ! -f "${DATADIR}"/food/pasture.tif ]; then - python3 ./prepare_layers/build_gaez_hyde.py --gaez "${DATADIR}"/food/GLCSv11_02_5m.tif \ - --hyde "${DATADIR}"/food/modified_grazing2017AD.asc \ - --output "${DATADIR}"/food/ -fi - -if [ ! -f "${DATADIR}"/food/crop_diff.tif ]; then - python3 ./utils/raster_diff.py --raster_a "${DATADIR}"/food/crop.tif \ - --raster_b "${DATADIR}"/food/current_layers/lcc_1401.tif \ - --output "${DATADIR}"/food/crop_diff.tif -fi - -if [ ! -f "${DATADIR}"/food/pasture_diff.tif ]; then - python3 ./utils/raster_diff.py --raster_a "${DATADIR}"/food/pasture.tif \ - --raster_b "${DATADIR}"/food/current_layers/lcc_1402.tif \ - --output "${DATADIR}"/food/pasture_diff.tif -fi - -if [ ! 
-f "${DATADIR}"/food/current_raw.tif ]; then - python3 ./prepare_layers/make_food_current_map.py --current_lvl1 "${DATADIR}"/habitat/current_raw.tif \ - --pnv "${DATADIR}"/habitat/pnv_raw.tif \ - --crop_diff "${DATADIR}"/food/crop_diff.tif \ - --pasture_diff "${DATADIR}"/food/pasture_diff.tif \ - --seed 42 \ - --output "${DATADIR}"/food/current_raw.tif -fi diff --git a/scripts/run.sh b/scripts/run.sh deleted file mode 100755 index 958bd8f..0000000 --- a/scripts/run.sh +++ /dev/null @@ -1,326 +0,0 @@ -#!/bin/bash -# -# Assumes you've set up a python virtual environement in the current directory. -# -# In addition to the Python environemnt, you will need the following extra command line tools: -# -# https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly -# https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel - -# Set shell script to exit on first error (-e) and to output commands being run to make -# reviewing logs easier (-x) -set -e -set -x - -# We know we use two Go tools, so add go/bin to our path as in slurm world they're likely -# to be installed locally -export PATH="${PATH}":"${HOME}"/go/bin -if ! hash littlejohn 2>/dev/null; then - echo "Please ensure littlejohn is available" - exit 1 -fi -if ! hash reclaimer 2>/dev/null; then - echo "Please ensure reclaimer is available" - exit 1 -fi - -# Detect if we're running under SLURM -if [[ -n "${SLURM_JOB_ID}" ]]; then - # Slurm users will probably need to customise this - # shellcheck disable=SC1091 - source "${HOME}"/venvs/life/bin/activate - cd "${HOME}"/dev/life - PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}" -else - PROCESS_COUNT=$(nproc --all) -fi - -if [ -z "${DATADIR}" ]; then - echo "Please specify $DATADIR" - exit 1 -fi - -if [ -z "${VIRTUAL_ENV}" ]; then - echo "Please specify run in a virtualenv" - exit 1 -fi - -# Experiment configuration. 
You can override these variables in the environment and it will use those -# values rather than the defaults specified here. -# shellcheck disable=SC2206 -declare -a SCENARIOS=(${SCENARIOS:-"arable" "restore" "restore_all" "urban" "pasture" "restore_agriculture"}) -# shellcheck disable=SC2206 -declare -a TAXAS=(${TAXAS:-"AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA"}) -export CURVE=${CURVE:0.25} -export PIXEL_SCALE=${PIXEL_SCALE:-0.016666666666667} - -check_scenario() { - local target="$1" - for scenario in "${SCENARIOS[@]}"; do - if [[ "$scenario" == "$target" ]]; then - return 0 - fi - done - return 1 -} - -if [ ! -f "${DATADIR}"/crosswalk.csv ]; then - python3 ./prepare_layers/generate_crosswalk.py --output "${DATADIR}"/crosswalk.csv -fi - -# Get habitat layer and prepare for use -if [ ! -f "${DATADIR}"/habitat/current_raw.tif ]; then - if [ ! -f "${DATADIR}"/habitat/jung_l2_raw.tif ]; then - reclaimer zenodo --zenodo_id 4058819 \ - --filename iucn_habitatclassification_composite_lvl2_ver004.zip \ - --extract \ - --output "${DATADIR}"/habitat/jung_l2_raw.tif - fi - - if [ ! -d "${DATADIR}"/habitat/lvl2_changemasks_ver004 ]; then - reclaimer zenodo --zenodo_id 4058819 \ - --filename lvl2_changemasks_ver004.zip \ - --extract \ - --output "${DATADIR}"/habitat/ - fi - - python3 ./prepare_layers/make_current_map.py --jung "${DATADIR}"/habitat/jung_l2_raw.tif \ - --update_masks "${DATADIR}"/habitat/lvl2_changemasks_ver004 \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --output "${DATADIR}"/habitat/current_raw.tif \ - -j 16 -fi - -if [ ! -d "${DATADIR}"/habitat_maps/current ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/current_raw.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/current/ -fi - -# Get PNV layer and prepare for use -if [ ! -f "${DATADIR}"/habitat/pnv_raw.tif ]; then - reclaimer zenodo --zenodo_id 4038749 \ - --filename pnv_lvl1_004.zip \ - --extract \ - --output "${DATADIR}"/habitat/pnv_raw.tif -fi - -if [ ! 
-d "${DATADIR}"/habitat_maps/pnv ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/pnv_raw.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/pnv/ -fi - -# Generate an area scaling map -if [ ! -f "${DATADIR}"/area-per-pixel.tif ]; then - python3 ./prepare_layers/make_area_map.py --scale "${PIXEL_SCALE}" --output "${DATADIR}"/area-per-pixel.tif -fi - -# Generate the arable scenario map -if check_scenario "arable"; then - if [ ! -f "${DATADIR}"/habitat/arable.tif ]; then - python3 ./prepare_layers/make_arable_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --output "${DATADIR}"/habitat/arable.tif - fi - - if [ ! -d "${DATADIR}"/habitat_maps/arable ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/arable.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/arable/ - fi - - if [ ! -f "${DATADIR}"/habitat/arable_diff_area.tif ]; then - python3 ./prepare_layers/make_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --scenario "${DATADIR}"/habitat/arable.tif \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/arable_diff_area.tif - fi -fi - -# Generate the pasture scenario map -if check_scenario "pasture"; then - if [ ! -f "${DATADIR}"/habitat/pasture.tif ]; then - python3 ./prepare_layers/make_pasture_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --output "${DATADIR}"/habitat/pasture.tif - fi - - if [ ! -d "${DATADIR}"/habitat_maps/pasture ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/pasture.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/pasture/ - fi - - if [ ! 
-f "${DATADIR}"/habitat/pasture_diff_area.tif ]; then - python3 ./prepare_layers/make_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --scenario "${DATADIR}"/habitat/pasture.tif \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/pasture_diff_area.tif - fi -fi - -# Generate the restore map -if check_scenario "restore"; then - if [ ! -f "${DATADIR}"/habitat/restore.tif ]; then - python3 ./prepare_layers/make_restore_map.py --pnv "${DATADIR}"/habitat/pnv_raw.tif \ - --current "${DATADIR}"/habitat/current_raw.tif \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --output "${DATADIR}"/habitat/restore.tif - fi - - if [ ! -d "${DATADIR}"/habitat_maps/restore ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/restore.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/restore/ - fi - - if [ ! -f "${DATADIR}"/habitat/restore_diff_area.tif ]; then - python3 ./prepare_layers/make_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --scenario "${DATADIR}"/habitat/restore.tif \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/restore_diff_area.tif - fi -fi - -# Generate the restore_agriculture map -if check_scenario "restore_agriculture"; then - if [ ! -f "${DATADIR}"/habitat/restore_agriculture.tif ]; then - python3 ./prepare_layers/make_restore_agriculture_map.py --pnv "${DATADIR}"/habitat/pnv_raw.tif \ - --current "${DATADIR}"/habitat/current_raw.tif \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --output "${DATADIR}"/habitat/restore_agriculture.tif - fi - - if [ ! -d "${DATADIR}"/habitat_maps/restore_agriculture ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/restore_agriculture.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/restore_agriculture/ - fi - - if [ ! 
-f "${DATADIR}"/habitat/restore_agriculture_diff_area.tif ]; then - python3 ./prepare_layers/make_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --scenario "${DATADIR}"/habitat/restore_agriculture.tif \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/restore_agriculture_diff_area.tif - fi -fi - -# Generate the restore all map -if check_scenario "restore_all"; then - if [ ! -f "${DATADIR}"/habitat/restore_all.tif ]; then - python3 ./prepare_layers/make_restore_all_map.py --pnv "${DATADIR}"/habitat/pnv_raw.tif \ - --current "${DATADIR}"/habitat/current_raw.tif \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --output "${DATADIR}"/habitat/restore_all.tif - fi - - if [ ! -d "${DATADIR}"/habitat_maps/restore_all ]; then - aoh-habitat-process --habitat "${DATADIR}"/habitat/restore_all.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat_maps/restore_all/ - fi - - if [ ! -f "${DATADIR}"/habitat/restore_all_diff_area.tif ]; then - python3 ./prepare_layers/make_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --scenario "${DATADIR}"/habitat/restore_all.tif \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/restore_all_diff_area.tif - fi -fi - -# Generate urban all map -if check_scenario "urban"; then - if [ ! -d "${DATADIR}"/habitat_maps/urban ]; then - python3 ./prepare_layers/make_constant_habitat.py --examplar "${DATADIR}"/habitat_maps/arable/lcc_1401.tif \ - --habitat_code 14.5 \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --output "${DATADIR}"/habitat_maps/urban - fi - - if [ ! 
-f "${DATADIR}"/habitat/urban_diff_area.tif ]; then - python3 ./prepare_layers/make_constant_diff_map.py --current "${DATADIR}"/habitat/current_raw.tif \ - --habitat_code 14.5 \ - --crosswalk "${DATADIR}"/crosswalk.csv \ - --area "${DATADIR}"/area-per-pixel.tif \ - --scale "${PIXEL_SCALE}" \ - --output "${DATADIR}"/habitat/urban_diff_area.tif - fi -fi - -# Fetch and prepare the elevation layers -if [ ! -f "${DATADIR}"/elevation.tif ]; then - reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation.tif -fi -if [ ! -f "${DATADIR}"/elevation-max.tif ]; then - gdalwarp -t_srs EPSG:4326 -tr "${PIXEL_SCALE}" -"${PIXEL_SCALE}" -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation.tif "${DATADIR}"/elevation-max.tif -fi -if [ ! -f "${DATADIR}"/elevation-min.tif ]; then - gdalwarp -t_srs EPSG:4326 -tr "${PIXEL_SCALE}" -"${PIXEL_SCALE}" -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation.tif "${DATADIR}"/elevation-min.tif -fi - -# Get species data per taxa from IUCN data -for TAXA in "${TAXAS[@]}" -do - if [ ! 
-d "${DATADIR}"/species-info/"${TAXA}"/ ]; then - if [ -f "${DATADIR}"/overrides.csv ]; then - python3 ./prepare_species/extract_species_psql.py --class "${TAXA}" \ - --output "${DATADIR}"/species-info/"${TAXA}"/ \ - --projection "EPSG:4326" \ - --overrides "${DATADIR}"/overrides.csv - else - python3 ./prepare_species/extract_species_psql.py --class "${TAXA}" \ - --output "${DATADIR}"/species-info/"${TAXA}"/ \ - --projection "EPSG:4326" - fi - fi -done - -# Generate the batch job input CSVs -python3 ./utils/speciesgenerator.py --datadir "${DATADIR}" \ - --output "${DATADIR}"/aohbatch.csv \ - --scenarios "${SCENARIOS[@]}" -python3 ./utils/persistencegenerator.py --datadir "${DATADIR}" \ - --curve "${CURVE}" \ - --output "${DATADIR}"/persistencebatch.csv \ - --scenarios "${SCENARIOS[@]}" - -# Calculate all the AoHs -littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/aohbatch.csv "${VIRTUAL_ENV}"/bin/aoh-calc -- --force-habitat - -# Generate validation summaries -aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ --output "${DATADIR}"/aohs/current.csv -aoh-collate-data --aoh_results "${DATADIR}"/aohs/pnv/ --output "${DATADIR}"/aohs/pnv.csv -for SCENARIO in "${SCENARIOS[@]}" -do - aoh-collate-data --aoh_results "${DATADIR}"/aohs/"${SCENARIO}"/ --output "${DATADIR}"/aohs/"${SCENARIO}".csv -done - -# Calculate predictors from AoHs -aoh-species-richness --aohs_folder "${DATADIR}"/aohs/current/ \ - --output "${DATADIR}"/predictors/species_richness.tif -aoh-endemism --aohs_folder "${DATADIR}"/aohs/current/ \ - --species_richness "${DATADIR}"/predictors/species_richness.tif \ - --output "${DATADIR}"/predictors/endemism.tif - -# Calculate the per species Delta P values -littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/persistencebatch.log -c "${DATADIR}"/persistencebatch.csv "${VIRTUAL_ENV}"/bin/python3 -- ./deltap/global_code_residents_pixel.py - -for SCENARIO in "${SCENARIOS[@]}" -do - for TAXA in "${TAXAS[@]}" - do - python3 
./utils/raster_sum.py --rasters_directory "${DATADIR}"/deltap/"${SCENARIO}"/"${CURVE}"/"${TAXA}"/ --output "${DATADIR}"/deltap_sum/"${SCENARIO}"/"${CURVE}"/"${TAXA}".tif - done - - python3 ./utils/species_totals.py --deltaps "${DATADIR}"/deltap/"${SCENARIO}"/"${CURVE}"/ --output "${DATADIR}"/deltap/"${SCENARIO}"/"${CURVE}"/totals.csv - - # Generate final map - python3 ./deltap/delta_p_scaled.py --input "${DATADIR}"/deltap_sum/"${SCENARIO}"/"${CURVE}"/ \ - --diffmap "${DATADIR}"/habitat/"${SCENARIO}"_diff_area.tif \ - --totals "${DATADIR}"/deltap/"${SCENARIO}"/"${CURVE}"/totals.csv \ - --output "${DATADIR}"/deltap_final/scaled_"${SCENARIO}"_"${CURVE}".tif -done diff --git a/tests/test_food_layer.py b/tests/test_food_layer.py index 43edba9..42a4c97 100644 --- a/tests/test_food_layer.py +++ b/tests/test_food_layer.py @@ -1,196 +1,407 @@ +import math import numpy as np import pytest +from pytest import param as P import yirgacheffe as yg -from yirgacheffe.layers import RasterLayer -from yirgacheffe.operators import DataType -from yirgacheffe.window import Area, PixelScale - -from prepare_layers.make_food_current_map import TileInfo, process_tile - -@pytest.mark.parametrize("initial,crop_diff,pasture_diff,expected", [ - (42, float("nan"), float("nan"), 42), - - # Other habitat replacement - (42, 1.0, float("nan"), 1401), - (42, float("nan"), 1.0, 1402), - (42, 0.0, float("nan"), 42), - (42, float("nan"), 0.0, 42), - (42, -1.0, float("nan"), 42), - (42, float("nan"), -1.0, 42), - - # Crop replacement - (1401, 1.0, float("nan"), 1401), - (1401, float("nan"), 1.0, 1401), - (1401, 0.0, float("nan"), 1401), - (1401, float("nan"), 0.0, 1401), - (1401, -1.0, float("nan"), 31), - (1401, -1.0, 1.0, 1402), - (1401, float("nan"), -1.0, 1401), - - # Pasture replacement - (1402, 1.0, float("nan"), 1402), - (1402, 1.0, float("nan"), 1402), - (1402, float("nan"), 0.0, 1402), - (1402, 0.0, float("nan"), 1402), - (1402, float("nan"), -1.0, 31), - (1402, 1.0, -1.0, 1401), - (1402, -1.0, 
float("nan"), 1402), - - # Don't touch urban - (1405, 1.0, float("nan"), 1405), - (1405, float("nan"), 1.0, 1405), - (1405, 0.0, float("nan"), 1405), - (1405, float("nan"), 0.0, 1405), - (1405, -1.0, float("nan"), 1405), - (1405, -1.0, 1.0, 1405), - (1405, float("nan"), -1.0, 1405), +from prepare_layers.make_food_current_map import balance_crop_and_pasture_differences, \ + CROP_CODE, PASTURE_CODE, remove_land_cover, add_land_cover, TileInfo, process_tile, PRESERVE_CODES + +@pytest.mark.parametrize( + [ + "initial_crop_diff", + "initial_pasture_diff", + "expected_crop_diff", + "expected_pasture_diff" + ], + [ + (0.0, 0.0, 0.0, 0.0), + (0.8, 0.0, 0.8, 0.0), + (-0.8, 0.0, -0.8, 0.0), + (0.0, 0.8, 0.0, 0.8), + (0.0, -0.8, 0.0, -0.8), + (0.4, 0.2, 0.4, 0.2), + (-0.4, -0.2, -0.4, -0.2), + ] +) +def test_balance_not_needed( + initial_crop_diff: float, + initial_pasture_diff: float, + expected_crop_diff: float, + expected_pasture_diff: float, +) -> None: + result_crop_diff, result_pasture_diff = balance_crop_and_pasture_differences( + initial_crop_diff, + initial_pasture_diff, + {}, + ) + assert expected_crop_diff == result_crop_diff + assert expected_pasture_diff == result_pasture_diff + + +def test_brokend_balance() -> None: + # Testing that we spot if we've said to remove land where there isn't any + lcc_data_map = { + CROP_CODE: np.zeros((10, 10)), + PASTURE_CODE: np.zeros((10, 10)), + } + with pytest.raises(ValueError): + _ = balance_crop_and_pasture_differences( + 0.5, + -0.1, + lcc_data_map, + ) + + +@pytest.mark.parametrize( + [ + "initial_crop_diff", + "initial_pasture_diff", + "expected_crop_diff", + "expected_pasture_diff", + "crop_cell_value", + "pasture_cell_value" + ], + [ + (0.25, -0.5, 0.0, -0.25, 0.25, 0.75), + ] +) +def test_simple_balance_transfer( + initial_crop_diff: float, + initial_pasture_diff: float, + expected_crop_diff: float, + expected_pasture_diff: float, + crop_cell_value: float, + pasture_cell_value: float, +) -> None: + # 0% crop, 100% 
pasture + lcc_data_map = { + CROP_CODE: np.zeros((10, 10)), + PASTURE_CODE: np.ones((10, 10)), + } + result_crop_diff, result_pasture_diff = balance_crop_and_pasture_differences( + initial_crop_diff, + initial_pasture_diff, + lcc_data_map, + ) + assert expected_crop_diff == result_crop_diff + assert expected_pasture_diff == result_pasture_diff + assert (lcc_data_map[CROP_CODE] == crop_cell_value).all() + assert (lcc_data_map[PASTURE_CODE] == pasture_cell_value).all() + + +@pytest.mark.parametrize( + [ + "initial_crop_diff", + "initial_pasture_diff", + "expected_crop_diff", + "expected_pasture_diff", + "crop_cell_value", + "pasture_cell_value" + ], + [ + (0.25, -0.5, 0.0, -0.25, 0.5, 0.5), + (0.5, -0.25, 0.25, 0.0, 0.5, 0.5), + ] +) +def test_simple_half_balance_transfer( + initial_crop_diff: float, + initial_pasture_diff: float, + expected_crop_diff: float, + expected_pasture_diff: float, + crop_cell_value: float, + pasture_cell_value: float, +) -> None: + # 0% crop, 50% pasture + lcc_data_map = { + CROP_CODE: np.zeros((10, 10)), + PASTURE_CODE: np.array([[i % 2] * 10 for i in range(10)]).astype(float), + } + result_crop_diff, result_pasture_diff = balance_crop_and_pasture_differences( + initial_crop_diff, + initial_pasture_diff, + lcc_data_map, + ) + assert expected_crop_diff == result_crop_diff + assert expected_pasture_diff == result_pasture_diff + + expected_crop_map = np.array([[i % 2] * 10 for i in range(10)]).astype(float) * crop_cell_value + expected_pasture_map = np.array([[i % 2] * 10 for i in range(10)]).astype(float) * pasture_cell_value + assert (expected_crop_map == lcc_data_map[CROP_CODE]).all() + assert (expected_pasture_map == lcc_data_map[PASTURE_CODE]).all() + + +@pytest.mark.parametrize( + [ + "crop_diff", + "expected_crop_cell", + "expected_other_cell", + ], + [ + (-0.5, 0.5, 0.5), + ] +) +def test_remove_land_simple( + crop_diff: float, + expected_crop_cell: float, + expected_other_cell: float, +) -> None: + # 100% crop, 0% other 1 + 
lcc_data_map = { + 1: np.zeros((10, 10)), + CROP_CODE: np.ones((10, 10)), + } + pnv_data = np.full((10, 10), 1) + + remove_land_cover( + CROP_CODE, + crop_diff, + pnv_data, + lcc_data_map, + ) + + expected_crop_map = np.full((10, 10), expected_crop_cell) + expected_other_map = np.full((10, 10), expected_other_cell) + assert (expected_crop_map == lcc_data_map[CROP_CODE]).all() + assert (expected_other_map == lcc_data_map[1]).all() + + +@pytest.mark.parametrize( + [ + "crop_diff", + "pnv_value", + "expected_crop_cell", + "expected_other_1_cell", + "expected_other_2_cell", + ], + [ + (-0.5, 1, 0.0, 1.0, 0.0), + (-0.75, 1, 0.0, 1.0, 0.0), # too much + (-0.25, 1, 0.5, 0.5, 0.0), + (-0.5, 2, 0.0, 0.0, 1.0), + (-0.25, 2, 0.5, 0.0, 0.5), + ] +) +def test_remove_land_less_simple( + crop_diff: float, + pnv_value: int, + expected_crop_cell: float, + expected_other_1_cell: float, + expected_other_2_cell: float, +) -> None: + # 50% crop, 50% other 2, 0% other 1 + lcc_data_map = { + 1: np.zeros((10, 10)), + 2: np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(float), + CROP_CODE: np.array([[i % 2] * 10 for i in range(10)]).astype(float), + } + pnv_data = np.full((10, 10), pnv_value) + + remove_land_cover( + CROP_CODE, + crop_diff, + pnv_data, + lcc_data_map, + ) + + expected_crop_map = np.array([[i % 2] * 10 for i in range(10)]).astype(float) * expected_crop_cell + assert (expected_crop_map == lcc_data_map[CROP_CODE]).all() + expected_other_1_map = np.array([[i % 2] * 10 for i in range(10)]).astype(float) * expected_other_1_cell + assert (expected_other_1_map == lcc_data_map[1]).all() + expected_other_2_map = np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(float) + \ + (np.array([[i % 2] * 10 for i in range(10)]).astype(float) * expected_other_2_cell) + assert (expected_other_2_map == lcc_data_map[2]).all() + + +@pytest.mark.parametrize("crop_diff,expected_crop_cell,expected_other_cell", [ + (0.5, 0.5, 0.5), + (1.0, 1.0, 0.0), ]) -def 
test_process_tile_all(initial, crop_diff, pasture_diff, expected) -> None: - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - datatype=DataType.Int16, - ) as current: - yg.constant(initial).save(current) - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - datatype=DataType.Int16, - ) as pnv: - yg.constant(31).save(pnv) - - test_tile = TileInfo( - x_position=0, - y_position=0, - width=10, - height=12, - crop_diff=crop_diff, - pasture_diff=pasture_diff, - ) - - result = process_tile(current, pnv, test_tile, 42) - expected = np.full((12, 10), expected, dtype=np.int16) - assert (result == expected).all() - -@pytest.mark.parametrize("initial,crop_diff,pasture_diff,expected_crop_count,expected_pasture_count,expected_pnv_count", [ # pylint:disable=C0301 - (42, float("nan"), float("nan"), 0, 0, 0), - (42, 0.5, float("nan"), 50, 0, 0), - (42, float("nan"), 0.5, 0, 50, 0), - (42, -0.5, float("nan"), 0, 0, 0), - (42, float("nan"), -0.5, 0, 0, 0), - - (1401, float("nan"), float("nan"), 100, 0, 0), - (1401, -0.5, float("nan"), 50, 0, 50), - (1401, float("nan"), -0.5, 100, 0, 0), - - (1402, float("nan"), float("nan"), 0, 100, 0), - (1402, float("nan"), -0.5, 0, 50, 50), - (1402, -0.5, float("nan"), 0, 100, 0), +def test_add_land_simple( + crop_diff: float, + expected_crop_cell: float, + expected_other_cell: float, +) -> None: + # 100% crop, 0% other 1 + lcc_data_map = { + 1: np.ones((10, 10)), + CROP_CODE: np.zeros((10, 10)), + } + + add_land_cover( + np.ones((10, 10), dtype=bool), + [(crop_diff, CROP_CODE)], + lcc_data_map, + ) + + expected_crop_map = np.full((10, 10), expected_crop_cell) + expected_other_map = np.full((10, 10), expected_other_cell) + assert (expected_crop_map == lcc_data_map[CROP_CODE]).all() + assert (expected_other_map == lcc_data_map[1]).all() + + +@pytest.mark.parametrize("crop_diff,expected_crop_cell,expected_other_cell", [ + (0.25, 0.5, 0.5), + (0.5, 1.0, 0.0), ]) -def 
test_partial_replacement( - initial : int, - crop_diff : float, - pasture_diff : float, - expected_crop_count : int, - expected_pasture_count : int, - expected_pnv_count : int, +def test_add_land_avoid_excluded( + crop_diff: float, + expected_crop_cell: float, + expected_other_cell: float, ) -> None: - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - datatype=DataType.Int16, - ) as current: - yg.constant(initial).save(current) - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - datatype=DataType.Int16, - ) as pnv: - yg.constant(31).save(pnv) - - test_tile = TileInfo( - x_position=0, - y_position=0, - width=10, - height=10, - crop_diff=crop_diff, - pasture_diff=pasture_diff, - ) - - result = process_tile(current, pnv, test_tile, 42) - crop_count = (result == 1401).sum() - assert crop_count == expected_crop_count - pasture_count = (result == 1402).sum() - assert pasture_count == expected_pasture_count - pnv_count = (result == 31).sum() - assert pnv_count == expected_pnv_count - -@pytest.mark.parametrize("initial,crop_diff,pasture_diff,expected_crop_count,expected_pasture_count,expected_pnv_count", [ # pylint:disable=C0301 - (42, float("nan"), float("nan"), 0, 0, 0), - (42, 0.5, float("nan"), 50, 0, 0), - (42, float("nan"), 0.5, 0, 50, 0), - (42, -0.5, float("nan"), 0, 0, 0), - (42, float("nan"), -0.5, 0, 0, 0), - - (1401, float("nan"), float("nan"), 50, 0, 0), - (1401, 1.0, float("nan"), 100, 0, 0), - (1401, 0.5, float("nan"), 100, 0, 0), - (1401, 0.1, float("nan"), 60, 0, 0), - (1401, -0.1, float("nan"), 40, 0, 10), - (1401, -0.5, float("nan"), 0, 0, 50), - (1401, -1.0, float("nan"), 0, 0, 50), - (1401, float("nan"), 1.0, 50, 50, 0), - - (1405, float("nan"), float("nan"), 0, 0, 0), - (1405, 1.0, float("nan"), 50, 0, 0), - (1405, 0.5, float("nan"), 50, 0, 0), - (1405, 0.1, float("nan"), 10, 0, 0), - (1405, -0.1, float("nan"), 0, 0, 0), - (1405, -0.5, float("nan"), 0, 0, 0), - (1405, 
-1.0, float("nan"), 0, 0, 0), + # 0% crop, 50% pasture, 50% other 1 + lcc_data_map = { + 1: np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(float), + PASTURE_CODE: np.array([[i % 2] * 10 for i in range(10)]).astype(float), + CROP_CODE: np.zeros((10, 10)), + } + + add_land_cover( + np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(bool), + [(crop_diff, CROP_CODE)], + lcc_data_map, + ) + + expected_crop_map = np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(float) * expected_crop_cell + expected_pasture_map = np.array([[i % 2] * 10 for i in range(10)]).astype(float) # unchanged + expected_other_map = np.array([[(i + 1) % 2] * 10 for i in range(10)]).astype(float) * expected_other_cell + assert (expected_crop_map == lcc_data_map[CROP_CODE]).all() + assert (expected_pasture_map == lcc_data_map[PASTURE_CODE]).all() + assert (expected_other_map == lcc_data_map[1]).all() + + +@pytest.mark.parametrize(["crop_diff", "pasture_diff", "expected_totals"], [ + P(0.25, 0.25, {1: 25, CROP_CODE: 25, PASTURE_CODE: 25, 4: 25}, id="no change"), + P(0.0, 0.0, {1: 75, CROP_CODE: 0, PASTURE_CODE: 0, 4: 25}, id="remove both"), + P(0.25, 0.0, {1: 50, CROP_CODE: 25, PASTURE_CODE: 0, 4: 25}, id="remove pasture, leave crop"), + P(0.0, 0.25, {1: 50, CROP_CODE: 0, PASTURE_CODE: 25, 4: 25}, id="remove crop, leave pasture"), + P(0.5, 0.25, {1: 12.5, CROP_CODE: 50, PASTURE_CODE: 25, 4: 12.5}, id="add crop, leave pasture"), + P(0.25, 0.5, {1: 12.5, CROP_CODE: 25, PASTURE_CODE: 50, 4: 12.5}, id="leave crop, add pasture"), + P(0.5, 0.2, {1: 15, CROP_CODE: 50, PASTURE_CODE: 20, 4: 15}, id="add crop, remove pasture, more add than remove"), + P(0.2, 0.5, {1: 15, CROP_CODE: 20, PASTURE_CODE: 50, 4: 15}, id="remove crop, add pasture, more add than remove"), + P(0.3, 0.1, {1: 35, CROP_CODE: 30, PASTURE_CODE: 10, 4: 25}, id="add crop, remove pasture, more remove than add"), + P(0.1, 0.3, {1: 35, CROP_CODE: 10, PASTURE_CODE: 30, 4: 25}, id="remove crop, add pasture, more remove than add"), + 
P(0.3, 0.3, {1: 20, CROP_CODE: 30, PASTURE_CODE: 30, 4: 20}, id="all both, but not total"), + P(0.5, 0.5, {1: 0, CROP_CODE: 50, PASTURE_CODE: 50, 4: 0}, id="all both"), + P(0.0, 0.5, {1: 25, CROP_CODE: 0, PASTURE_CODE: 50, 4: 25}, id="replace crop with pasture"), + P(0.5, 0.0, {1: 25, CROP_CODE: 50, PASTURE_CODE: 0, 4: 25}, id="replace pasture with crop"), + P(0.0, 1.0, {1: 0, CROP_CODE: 0, PASTURE_CODE: 100, 4: 0}, id="All pasture, all the time"), + P(1.0, 0.0, {1: 0, CROP_CODE: 100, PASTURE_CODE: 0, 4: 0}, id="All crop, all the time"), ]) -def test_partial_initial_tile( - initial : int, - crop_diff : float, - pasture_diff : float, - expected_crop_count : int, - expected_pasture_count : int, - expected_pnv_count : int, +def test_process_tile(crop_diff: float, pasture_diff: float, expected_totals: dict[int, float]) -> None: + # One quarter neither, one quarter crop, one quarter pasture, one quarter other neither + data = np.ones((5, 5)) + raw_lcc_data_map = { + 1: np.pad(data, ((0, 5), (0, 5))), + CROP_CODE: np.pad(data, ((0, 5), (5, 0))), + PASTURE_CODE: np.pad(data, ((5, 0), (0, 5))), + 4: np.pad(data, ((5, 0), (5, 0))), + } + projection = yg.MapProjection("epsg:4326", 1.0, -1.0) + lcc_maps = {lcc: yg.from_array(x, (0, 0), projection) for lcc, x in raw_lcc_data_map.items()} + + # The pnv is just class 1, so that is always the category that will increase, class 4 will + only go below its initial value or be the same + pnv_data = np.ones((10, 10)) # All class 1 + pnv_map = yg.from_array(pnv_data, (0, 0), projection) + + tile = TileInfo( + x_position=0, + y_position=0, + width=10, + height=10, + crop_target=crop_diff, + pasture_target=pasture_diff, + ) + + updated_maps = process_tile(lcc_maps, pnv_map, tile) + + for key, expected_value in expected_totals.items(): + layer = updated_maps[key] + layer_total = np.sum(layer) + assert layer_total >= 0 + assert math.isclose(layer_total, expected_value, rel_tol=0.000001), f"Failed for {key}" + + 
+@pytest.mark.parametrize(["crop_diff", "pasture_diff", "expected_totals"], [ + P(0.25, 0.25, {1: 25, CROP_CODE: 25, PASTURE_CODE: 25}, id="no change"), + P(0.0, 0.0, {1: 75, CROP_CODE: 0, PASTURE_CODE: 0}, id="remove both"), + P(0.25, 0.0, {1: 50, CROP_CODE: 25, PASTURE_CODE: 0}, id="remove pasture, leave crop"), + P(0.0, 0.25, {1: 50, CROP_CODE: 0, PASTURE_CODE: 25}, id="remove crop, leave pasture"), + P(0.5, 0.25, {1: 0, CROP_CODE: 50, PASTURE_CODE: 25}, id="add crop, leave pasture"), + P(0.25, 0.5, {1: 0, CROP_CODE: 25, PASTURE_CODE: 50}, id="leave crop, add pasture"), + P(0.5, 0.2, {1: 5, CROP_CODE: 50, PASTURE_CODE: 20}, id="add crop, remove pasture, more add than remove"), + P(0.2, 0.5, {1: 5, CROP_CODE: 20, PASTURE_CODE: 50}, id="remove crop, add pasture, more add than remove"), + P(0.3, 0.1, {1: 35, CROP_CODE: 30, PASTURE_CODE: 10}, id="add crop, remove pasture, more remove than add"), + P(0.1, 0.3, {1: 35, CROP_CODE: 10, PASTURE_CODE: 30}, id="remove crop, add pasture, more remove than add"), + P(0.3, 0.3, {1: 15, CROP_CODE: 30, PASTURE_CODE: 30}, id="all both, but not total"), + P(0.5, 0.5, {1: 0, CROP_CODE: 37.5, PASTURE_CODE: 37.5}, id="all both (unobtainable)"), + P(0.0, 0.5, {1: 25, CROP_CODE: 0, PASTURE_CODE: 50}, id="replace crop with pasture"), + P(0.5, 0.0, {1: 25, CROP_CODE: 50, PASTURE_CODE: 0}, id="replace pasture with crop"), + P(0.0, 1.0, {1: 0, CROP_CODE: 0, PASTURE_CODE: 75}, id="All pasture, all the time"), + P(1.0, 0.0, {1: 0, CROP_CODE: 75, PASTURE_CODE: 0}, id="All crop, all the time"), + P(0.6, 0.2, {1: 0, CROP_CODE: 55, PASTURE_CODE: 20}, id="Less pasture, but too much crop demand"), + P(0.2, 0.6, {1: 0, CROP_CODE: 20, PASTURE_CODE: 55}, id="Less crop, but too much pasture demand"), +]) +def test_process_tile_with_preserve_codes( + crop_diff: float, + pasture_diff: float, + expected_totals: dict[int, float] ) -> None: - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - 
datatype=DataType.Int16, - ) as current: - - # Cheating as Yirgacheffe doesn't have a neat way to provide numpy data straight to a layer - band = current._dataset.GetRasterBand(1) - vals = np.array([[initial, 22], [22, initial]]) - all_vals = np.tile(vals, (90, 180)) - band.WriteArray(all_vals, 0, 0) - - with RasterLayer.empty_raster_layer( - Area(-180, 90, 180, -90), - PixelScale(1.0, -1.0), - datatype=DataType.Int16, - ) as pnv: - yg.constant(31).save(pnv) - - test_tile = TileInfo( - x_position=0, - y_position=0, - width=10, - height=10, - crop_diff=crop_diff, - pasture_diff=pasture_diff, - ) - - result = process_tile(current, pnv, test_tile, 42) - crop_count = (result == 1401).sum() - assert crop_count == expected_crop_count - pasture_count = (result == 1402).sum() - assert pasture_count == expected_pasture_count - pnv_count = (result == 31).sum() - assert pnv_count == expected_pnv_count + # One quarter neither, one quarter crop, one quarter pasture, one quarter preserved + data = np.ones((5, 5)) + raw_lcc_data_map = { + 1: np.pad(data, ((0, 5), (0, 5))), + CROP_CODE: np.pad(data, ((0, 5), (5, 0))), + PASTURE_CODE: np.pad(data, ((5, 0), (0, 5))), + PRESERVE_CODES[0]: np.pad(data, ((5, 0), (5, 0))), + } + projection = yg.MapProjection("epsg:4326", 1.0, -1.0) + lcc_maps = {lcc: yg.from_array(x, (0, 0), projection) for lcc, x in raw_lcc_data_map.items()} + + # The pnv is just class 1, so that is always the category that will increase + pnv_data = np.ones((10, 10)) # All class 1 + pnv_map = yg.from_array(pnv_data, (0, 0), projection) + + tile = TileInfo( + x_position=0, + y_position=0, + width=10, + height=10, + crop_target=crop_diff, + pasture_target=pasture_diff, + ) + + updated_maps = process_tile(lcc_maps, pnv_map, tile) + + # This quarter of the tile should never be modified + expected_totals[PRESERVE_CODES[0]] = 25 + + for key, expected_value in expected_totals.items(): + layer = updated_maps[key] + layer_total = np.sum(layer) + assert layer_total >= 0 + 
assert math.isclose(layer_total, expected_value, rel_tol=0.000001), f"Failed for {key}" + + +def test_ensure_no_negative_values() -> None: + # This test case is something that was seen in real world use: although the math + # generally should prevent us going negative, it does happen due to rounding. This + # test case tries to force a situation where we'd get a negative value, and we + # want to ensure it's clipped to zero instead. + shape = (4,) + eligible_mask = np.array([True, True, False, False]) + + crop = np.zeros(shape) + other = np.array([0.05, 0.05, 0.8, 0.8]) + + lcc_data_map = { + CROP_CODE: crop, + 1: other, + } + + add_land_cover(eligible_mask, [(0.1, CROP_CODE)], lcc_data_map) + + for lcc_data in lcc_data_map.values(): + assert (lcc_data >= 0).all() diff --git a/utils/binary_maps.py b/utils/binary_maps.py index f0857b5..a79ee51 100644 --- a/utils/binary_maps.py +++ b/utils/binary_maps.py @@ -1,45 +1,40 @@ import argparse import os from multiprocessing import cpu_count +from pathlib import Path -import numpy as np -from osgeo import gdal -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg -layers = ["all", "AMPHIBIANS", "AVES", "MAMMALIA", "REPTILES"] -# layers = ["AMPHIBIANS", "AVES", "MAMMALIA", "REPTILES"] +LAYERS = ["all", "AMPHIBIANS", "AVES", "MAMMALIA", "REPTILES"] +# LAYERS = ["AMPHIBIANS", "AVES", "MAMMALIA", "REPTILES"] def binary_maps( - map_path: str, - output_path: str, + map_path: Path, + output_path: Path, parallelism: int ) -> None: - output_dir, filename = os.path.split(output_path) - base, _ext = os.path.splitext(filename) - os.makedirs(output_dir, exist_ok=True) + os.makedirs(output_path.parent, exist_ok=True) - for index, layername in enumerate(layers): - with RasterLayer.layer_from_file(map_path, band=index+1) as inputmap: - with RasterLayer.empty_raster_layer_like( - inputmap, - filename=os.path.join(output_dir, f"{base}_{layername}_binary.tif"), - datatype=gdal.GDT_Int16, - ) as result: - calc = 
inputmap.numpy_apply(lambda c: np.where(c != 0, c / np.abs(c), 0)) - calc.parallel_save(result, parallelism=parallelism) + for index, layername in enumerate(LAYERS): + with yg.read_raster(map_path, band=index+1) as inputmap: + binary_map = yg.where(inputmap != 0, inputmap / yg.abs(inputmap), 0) + binary_map.astype(yg.DataType.Int16).to_geotiff( + output_path.parent / f"{output_path.stem}_{layername}_binary.tif", + parallelism=parallelism, + ) def main() -> None: parser = argparse.ArgumentParser(description="Converts the output maps to binary") parser.add_argument( "--input", - type=str, + type=Path, required=True, dest="input_filename", help="multilayer result geotiff" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_filename", help="Destination geotiff path." diff --git a/utils/footprint_of_humanity.py b/utils/footprint_of_humanity.py index 7399833..77d60a6 100644 --- a/utils/footprint_of_humanity.py +++ b/utils/footprint_of_humanity.py @@ -24,7 +24,7 @@ def absolute( merge1 = pd.merge(current_cleaned, pnv_cleaned, on=["id_no", "season"], suffixes=["_current", "_pnv"]) merge2 = pd.merge(merge1, scenario, on=["id_no", "season"], how="left", indicator=True) merged = merge2[["id_no", "season", "class_name", "aoh_total_current", "aoh_total_pnv", "aoh_total", "_merge"]].copy() - merged.aoh_total = merged.aoh_total.fillna(0) + merged["aoh_total"] = merged.aoh_total.fillna(0) merged.rename(columns={"aoh_total": "aoh_total_scenario"}, inplace=True) merged["current_persistence"] = (merged.aoh_total_current / merged.aoh_total_pnv) ** 0.25 diff --git a/utils/habitat_stats.py b/utils/habitat_stats.py index 9221cf3..0cdb49e 100644 --- a/utils/habitat_stats.py +++ b/utils/habitat_stats.py @@ -4,36 +4,33 @@ from pathlib import Path import pandas as pd -from yirgacheffe.layers import RasterLayer, UniformAreaLayer - -from osgeo import gdal -gdal.SetCacheMax(1024 * 1024 * 16) +import yirgacheffe as yg def habitat_stats( habitats_dir: Path, - 
area_map_path: Path, output_dir: Path, process_count: int, ) -> None: - with UniformAreaLayer.layer_from_file(area_map_path) as area_raster: - os.makedirs(output_dir, exist_ok=True) - for scenario in Path(habitats_dir).iterdir(): - res = [] - for habitat in scenario.glob("*.tif"): - # Filenames have the form "lcc_200.tif" - we want the IUCN habitat class 2 or 14.1 etc." - basename, _ = os.path.splitext(habitat.name) - jung_habitat_class = int(basename.split("_")[1]) - if jung_habitat_class == 0: - continue - with RasterLayer.layer_from_file(str(habitat)) as raster: - raw_area = raster.parallel_sum(parallelism=process_count) - scaled_area_calc = raster * area_raster - scaled_area = scaled_area_calc.parallel_sum(parallelism=process_count) + os.makedirs(output_dir, exist_ok=True) + for scenario in habitats_dir.iterdir(): + res = [] + for habitat in scenario.glob("*.tif"): + # Filenames have the form "lcc_200.tif" - we want the IUCN habitat class 2 or 14.1 etc." + jung_habitat_class = int(habitat.stem.split("_")[1]) + if jung_habitat_class == 0: + continue + with ( + yg.read_raster(habitat) as raster, + yg.area_raster(raster.map_projection) as area_raster, + ): + raw_area = raster.parallel_sum(parallelism=process_count) + scaled_area_calc = raster * area_raster + scaled_area = scaled_area_calc.parallel_sum(parallelism=process_count) - res.append([jung_habitat_class, raw_area, scaled_area]) - df = pd.DataFrame(res, columns=["habitat", "pixel area", "geo area"]) - df = df.sort_values("habitat") - df.to_csv(output_dir / f"{scenario.name}.csv", index=False) + res.append([jung_habitat_class, raw_area, scaled_area]) + df = pd.DataFrame(res, columns=["habitat", "pixel area", "geo area"]) + df = df.sort_values("habitat") + df.to_csv(output_dir / f"{scenario.name}.csv", index=False) def main() -> None: parser = argparse.ArgumentParser(description="Generate stats on the habitat makeup of each scenario") @@ -43,12 +40,6 @@ def main() -> None: required=True, dest="habitats_dir", ) 
- parser.add_argument( - "--area", - type=Path, - required=True, - dest="area_map_path", - ) parser.add_argument( "--output", type=Path, @@ -68,7 +59,6 @@ def main() -> None: habitat_stats( args.habitats_dir, - args.area_map_path, args.output_dir, args.process_count, ) diff --git a/utils/persistencegenerator.py b/utils/persistencegenerator.py index a709246..b95bf43 100644 --- a/utils/persistencegenerator.py +++ b/utils/persistencegenerator.py @@ -24,11 +24,12 @@ def species_generator( res = [] for taxa in taxas: taxa_path = species_info_dir / taxa / "current" - speciess = list(taxa_path.glob("*.geojson")) + speciess = [x.stem.split('_') for x in taxa_path.glob("*.geojson")] for scenario in scenarios: - for species in speciess: + for taxid, season in speciess: res.append([ - species, + taxid, + season, aohs_path / "current" / taxa, aohs_path / scenario / taxa, aohs_path / "pnv" / taxa, @@ -37,7 +38,8 @@ def species_generator( ]) df = pd.DataFrame(res, columns=[ - '--speciesdata', + '--taxid', + '--season', '--current_path', '--scenario_path', '--historic_path', diff --git a/utils/raster_diff.py b/utils/raster_diff.py index d61055a..6c52f9b 100644 --- a/utils/raster_diff.py +++ b/utils/raster_diff.py @@ -3,6 +3,7 @@ from pathlib import Path import yirgacheffe as yg +from snakemake_argparse_bridge import snakemake_compatible # type: ignore def raster_diff( raster_a_path: Path, @@ -10,12 +11,18 @@ def raster_diff( output_path: Path, ) -> None: os.makedirs(output_path.parent, exist_ok=True) + with ( + yg.read_raster(raster_a_path) as raster_a, + yg.read_raster(raster_b_path) as raster_b + ): + result = raster_a - raster_b + result.to_geotiff(output_path, parallelism=True) - with yg.read_raster(raster_a_path) as raster_a: - with yg.read_raster(raster_b_path) as raster_b: - result = raster_a - raster_b - result.to_geotiff(output_path, parallelism=True) - +@snakemake_compatible(mapping={ + "raster_a_path": "input.raster_a", + "raster_b_path": "input.raster_b", + 
"output_path": "output.diff", +}) def main() -> None: parser = argparse.ArgumentParser(description="Calculates the difference between two rasters") parser.add_argument( @@ -30,13 +37,13 @@ def main() -> None: type=Path, required=True, dest="raster_b_path", - help="Right hands side of the difference" + help="Right hand side of the difference" ) parser.add_argument( "--output", type=Path, required=True, - dest="output_filename", + dest="output_path", help="Destination geotiff file for results." ) args = parser.parse_args() @@ -44,7 +51,7 @@ def main() -> None: raster_diff( args.raster_a_path, args.raster_b_path, - args.output_filename, + args.output_path, ) if __name__ == "__main__": diff --git a/utils/raster_sum.py b/utils/raster_sum.py index 40b0592..c52469b 100644 --- a/utils/raster_sum.py +++ b/utils/raster_sum.py @@ -1,117 +1,29 @@ import argparse -import os -import queue import resource -import sys -import tempfile -import time from pathlib import Path -from multiprocessing import Manager, Process, Queue, cpu_count -from typing import Optional -from yirgacheffe.layers import RasterLayer # type: ignore -from yirgacheffe.operators import DataType - -def worker( - compress: bool, - filename: str, - result_dir: Path, - input_queue: Queue, -) -> None: - output_tif = result_dir / filename - - merged_result: Optional[RasterLayer] = None - - while True: - try: - path: Path = input_queue.get_nowait() - except queue.Empty: - break - - with RasterLayer.layer_from_file(path) as partial_raster: - if merged_result is None: - merged_result = RasterLayer.empty_raster_layer_like(partial_raster, datatype=DataType.Float64) - cleaned_raster = partial_raster.nan_to_num(nan=0.0) - cleaned_raster.save(merged_result) - else: - calc = merged_result + partial_raster.nan_to_num(nan=0.0) - temp = RasterLayer.empty_raster_layer_like(calc, datatype=DataType.Float64) - calc.save(temp) - merged_result = temp - - if merged_result: - with RasterLayer.empty_raster_layer_like(merged_result, 
filename=output_tif, compress=compress) as final: - merged_result.save(final) - input_queue.put(output_tif) +import yirgacheffe as yg +from alive_progress import alive_bar # type: ignore +from snakemake_argparse_bridge import snakemake_compatible # type: ignore def raster_sum( images_dir: Path, output_filename: Path, - processes_count: int ) -> None: - print(f"process count set to {processes_count}") - + # We'll be opening all the deltap files per taxa in one, so we'll need to raise + # the number of files we can open. _, max_fd_limit = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (max_fd_limit, max_fd_limit)) - print(f"Set fd limit to {max_fd_limit}") - - os.makedirs(output_filename.parent, exist_ok=True) - - files = list(images_dir.glob("*.tif")) - if not files: - sys.exit(f"No files in {images_dir}, aborting") - print(f"Found {len(files)} images to process") - - with tempfile.TemporaryDirectory() as tempdir: - with Manager() as manager: - source_queue = manager.Queue() - - for file in files: - source_queue.put(file) - - workers = [Process(target=worker, args=( - False, - f"{index}.tif", - Path(tempdir), - source_queue - )) for index in range(processes_count)] - for worker_process in workers: - worker_process.start() - - processes = workers - while processes: - candidates = [x for x in processes if not x.is_alive()] - for candidate in candidates: - candidate.join() - if candidate.exitcode: - for victim in processes: - victim.kill() - sys.exit(candidate.exitcode) - processes.remove(candidate) - time.sleep(0.1) - - # here we should have now a set of images in tempdir to merge - single_worker = Process(target=worker, args=( - True, - output_filename.name, - output_filename.parent, - source_queue - )) - single_worker.start() - - processes = [single_worker] - while processes: - candidates = [x for x in processes if not x.is_alive()] - for candidate in candidates: - candidate.join() - if candidate.exitcode: - for victim in 
processes: - victim.kill() - sys.exit(candidate.exitcode) - processes.remove(candidate) - time.sleep(1) + layers = [yg.read_raster(x) for x in images_dir.glob("*.tif")] + total = yg.sum(layers) + with alive_bar(manual=True) as bar: + total.to_geotiff(output_filename, callback=bar, parallelism=True) +@snakemake_compatible(mapping={ + "rasters_directory": "params.rasters_dir", + "output_filename": "output[0]", +}) def main() -> None: parser = argparse.ArgumentParser(description="Sums many rasters into a single raster") parser.add_argument( @@ -128,20 +40,11 @@ def main() -> None: dest="output_filename", help="Destination geotiff file for results." ) - parser.add_argument( - "-j", - type=int, - required=False, - default=round(cpu_count() / 2), - dest="processes_count", - help="Number of concurrent threads to use." - ) args = parser.parse_args() raster_sum( args.rasters_directory, args.output_filename, - args.processes_count ) if __name__ == "__main__": diff --git a/utils/regression_plot.py b/utils/regression_plot.py index 48894f6..373b0d5 100644 --- a/utils/regression_plot.py +++ b/utils/regression_plot.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt import numpy as np -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg def filter_data(chunks): a_chunk, b_chunk = chunks @@ -30,17 +30,19 @@ def regression_plot( output_dir, _ = os.path.split(output_path) os.makedirs(output_dir, exist_ok=True) - with RasterLayer.layer_from_file(a_path) as a_layer: - with RasterLayer.layer_from_file(b_path) as b_layer: - if a_layer.pixel_scale != b_layer.pixel_scale: - sys.exit("GeoTIFFs have different pixel scale") - if a_layer.area != b_layer.area: - sys.exit("GeoTIFFs have different spatial coordinates") - if a_layer.window != b_layer.window: - sys.exit("GeoTIFFs have different pixel dimensions") + with ( + yg.read_raster(a_path) as a_layer, + yg.read_raster(b_path) as b_layer, + ): + if a_layer.pixel_scale != b_layer.pixel_scale: + sys.exit("GeoTIFFs have 
different pixel scale") + if a_layer.area != b_layer.area: + sys.exit("GeoTIFFs have different spatial coordinates") + if a_layer.window != b_layer.window: + sys.exit("GeoTIFFs have different pixel dimensions") - a_pixels = a_layer.read_array(0, 0, a_layer.window.xsize, a_layer.window.ysize) - b_pixels = b_layer.read_array(0, 0, b_layer.window.xsize, b_layer.window.ysize) + a_pixels = a_layer.read_array(0, 0, a_layer.window.xsize, a_layer.window.ysize) + b_pixels = b_layer.read_array(0, 0, b_layer.window.xsize, b_layer.window.ysize) with Pool(processes=cpu_count() // 2) as pool: filtered_chunk_pairs = pool.map(filter_data, zip(a_pixels, b_pixels)) diff --git a/utils/species_totals.py b/utils/species_totals.py index dfac10b..351b8f5 100644 --- a/utils/species_totals.py +++ b/utils/species_totals.py @@ -3,6 +3,7 @@ from typing import Dict import pandas as pd +from snakemake_argparse_bridge import snakemake_compatible # type: ignore TAXA = ["AMPHIBIA", "AVES", "MAMMALIA", "REPTILIA"] @@ -20,6 +21,10 @@ def species_totals( df.loc[-1] = ["all", df["count"].sum()] df.to_csv(output_path, index=False) +@snakemake_compatible(mapping={ + "deltaps_path": "params.deltaps_dir", + "output_filename": "output.totals", +}) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( diff --git a/utils/speciesgenerator.py b/utils/speciesgenerator.py index 85e2808..ccd538d 100644 --- a/utils/speciesgenerator.py +++ b/utils/speciesgenerator.py @@ -41,17 +41,15 @@ def species_generator( habitat_maps_path, data_dir / "elevation-max.tif", data_dir / "elevation-min.tif", - data_dir / "area-per-pixel.tif", data_dir / "crosswalk.csv", species, aohs_path / scenario / taxa, ]) df = pd.DataFrame(res, columns=[ - '--habitats', + '--fractional_habitats', '--elevation-max', '--elevation-min', - '--area', '--crosswalk', '--speciesdata', '--output' diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000..b179ea9 --- /dev/null +++ b/workflow/Snakefile 
@@ -0,0 +1,193 @@ +# LIFE Pipeline - Snakemake Workflow +# =================================== +# +# This workflow implements the LIFE biodiversity metric methodology. +# It processes species data, generates Area of Habitat (AoH) rasters, and +# produces the different scenario output rasters. +# +# Usage: +# snakemake --cores N all +# snakemake --cores N life +# snakemake --cores N summaries +# snakemake --cores N validation +# +# Environment variables required: +# DATADIR - Directory for input/output data +# DB_HOST, DB_NAME, DB_USER, DB_PASSWORD - PostgreSQL credentials +# +# For GBIF validation (optional): +# GBIF_USERNAME, GBIF_EMAIL, GBIF_PASSWORD + +import os +from pathlib import Path + +# Allow GDAL/OGR to handle large GeoJSON objects +os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0" + +# Source directory (where this repo lives) +SRCDIR = Path(workflow.basedir).parent + + +# Load configuration +configfile: SRCDIR / "config/config.yaml" + + +# Data directory from environment variable +DATADIR = Path(os.environ.get("DATADIR", "/data")) + +# Taxa list from config +TAXA = config["taxa"] + +# Scenarios from config +SCENARIOS = config["scenarios"] + +# Z-curve value (single value, not a wildcard) +CURVE = config["curve"] + +# All scenarios used for AOH generation +ALL_AOH_SCENARIOS = SCENARIOS + ["current", "pnv"] + + +# ============================================================================= +# Utility Functions for Checkpoint-Based Expansion +# ============================================================================= +# These must be defined before the includes so they are available when +# the rule modules are parsed. + + +def get_species_era(scenario): + """Return the species era (current/historic) for a given scenario.""" + return "historic" if scenario == "pnv" else "current" + + +def get_species_ids_for_taxa_scenario(wildcards): + """ + Return species IDs for a given taxa and scenario. + Uses the extract_species_data checkpoint output. 
+ """ + era = get_species_era(wildcards.scenario) + checkpoint_output = checkpoints.extract_species_data.get(taxa=wildcards.taxa).output + if era == "current": + report_path = Path(str(checkpoint_output.current_report)) + else: + report_path = Path(str(checkpoint_output.historic_report)) + geojson_dir = report_path.parent + return [p.stem[6:] for p in sorted(geojson_dir.glob("*.geojson"))] + + +def get_all_aoh_metadata_for_taxa_scenario(wildcards): + """ + Return paths to all AOH metadata JSON files for a taxa and scenario. + """ + species_ids = get_species_ids_for_taxa_scenario(wildcards) + return [ + DATADIR / "aohs" / wildcards.scenario / wildcards.taxa / f"aoh_{sid}.json" + for sid in species_ids + ] + + +def get_species_ids_for_delta_p(wildcards): + """Return species IDs for delta P (always from current/ era).""" + checkpoint_output = checkpoints.extract_species_data.get(taxa=wildcards.taxa).output + geojson_dir = Path(str(checkpoint_output.current_report)).parent + return [p.stem for p in sorted(geojson_dir.glob("*.geojson"))] + + +def get_delta_p_sentinels_for_taxa_scenario(wildcards): + # Return the per-species delta P sentinel (.done) paths for a taxa/scenario, + # so downstream rules wait for every calculate_delta_p job to finish + species_ids = get_species_ids_for_taxa_scenario(wildcards) + return [ + DATADIR + / "deltap" + / wildcards.scenario + / CURVE + / wildcards.taxa + / f".{sid}.done" + for sid in species_ids + ] + + +# Include rule modules +include: "rules/prepare.smk" +include: "rules/habitat.smk" +include: "rules/scenario.smk" +include: "rules/species.smk" +include: "rules/aoh.smk" +include: "rules/delta_p.smk" +include: "rules/validation.smk" + + +# ============================================================================= +# Target Rules +# ============================================================================= + + +rule all: + """Default target: full LIFE pipeline + model validation.""" + input: + expand( + str(DATADIR / "deltap_final" / 
"scaled_{scenario}_{curve}.tif"), + scenario=SCENARIOS, + curve=[CURVE], + ), + DATADIR / "validation" / "model_validation.csv", + + +rule life: + """Target: run the LIFE pipeline without validation.""" + input: + expand( + str(DATADIR / "deltap_final" / "scaled_{scenario}_{curve}.tif"), + scenario=SCENARIOS, + curve=[CURVE], + ), + + +rule summaries: + """ +Target: generate species richness and endemism maps. +NOT included in 'all' — run explicitly with: snakemake summaries +""" + input: + DATADIR / "summaries" / "species_richness.tif", + DATADIR / "summaries" / "endemism.tif", + + +rule aohs: + """Target: generate all AOH rasters and collated summary CSVs.""" + input: + expand( + str(DATADIR / "aohs" / "{scenario}" / "{taxa}" / ".complete"), + scenario=ALL_AOH_SCENARIOS, + taxa=TAXA, + ), + expand( + str(DATADIR / "aohs" / "{scenario}.csv"), + scenario=ALL_AOH_SCENARIOS, + ), + + +rule species_data: + """Target: extract species data from PostgreSQL.""" + input: + expand( + str(DATADIR / "species-info" / "{taxa}" / "current" / "report.csv"), + taxa=TAXA, + ), + + +rule prepare: + """Target: prepare all base layers.""" + input: + DATADIR / "crosswalk.csv", + DATADIR / "habitat_layers" / "current" / ".sentinel", + DATADIR / "habitat_layers" / "pnv" / ".sentinel", + DATADIR / "elevation-max.tif", + DATADIR / "elevation-min.tif", + + +rule validation: + """Target: run model validation.""" + input: + DATADIR / "validation" / "model_validation.csv", diff --git a/workflow/profiles/default/config.yaml b/workflow/profiles/default/config.yaml new file mode 100644 index 0000000..238b1e0 --- /dev/null +++ b/workflow/profiles/default/config.yaml @@ -0,0 +1,2 @@ +resources: + db_connections: 1 diff --git a/workflow/rules/aoh.smk b/workflow/rules/aoh.smk new file mode 100644 index 0000000..72a8951 --- /dev/null +++ b/workflow/rules/aoh.smk @@ -0,0 +1,166 @@ +# LIFE Pipeline - Area of Habitat (AoH) Generation Rules +# ======================================================= +# +# 
Generates AoH rasters for each species across all scenarios. +# Scenarios: current, pnv, and user-defined scenarios (arable, restore, ...) +# +# For current/scenario AOHs: uses species-info/{taxa}/current/ geojsons +# For pnv AOHs: uses species-info/{taxa}/historic/ geojsons +# +# Code-sensitive: rebuilds if the aoh package version changes. + +import os +from pathlib import Path + +# ============================================================================= +# Version Sentinel for AOH Code +# ============================================================================= + + +rule aoh_version_sentinel: + """ +Track the aoh package version. AOH rules depend on this to trigger +rebuilds when the package updates. +""" + output: + sentinel=DATADIR / ".sentinels" / "aoh_version.txt", + run: + import subprocess + + os.makedirs(os.path.dirname(output.sentinel), exist_ok=True) + try: + result = subprocess.run( + ["aoh-calc", "--version"], + capture_output=True, + text=True, + check=True, + ) + aoh_version = result.stdout.strip() + except Exception: + aoh_version = "unknown" + with open(output.sentinel, "w") as f: + f.write(f"aoh: {aoh_version}\n") + + +# ============================================================================= +# Per-Species AOH Generation +# ============================================================================= + + +def aoh_species_inputs(wildcards): + """Return inputs for generate_aoh.""" + era = "historic" if wildcards.scenario == "pnv" else "current" + return { + "species_data": DATADIR + / "species-info" + / wildcards.taxa + / era + / f"range_{wildcards.species_id}.geojson", + "habitat_sentinel": ancient( + DATADIR / "habitat_layers" / wildcards.scenario / ".sentinel" + ), + "elevation_max": ancient(DATADIR / "elevation-max.tif"), + "elevation_min": ancient(DATADIR / "elevation-min.tif"), + "crosswalk": DATADIR / "crosswalk.csv", + "version_sentinel": DATADIR / ".sentinels" / "aoh_version.txt", + } + + +rule generate_aoh: + """ +Generate 
Area of Habitat raster for a single species in a given scenario. + +Parallelizable: run with `snakemake --cores N` to process multiple species +concurrently. Uses --force-habitat and --pixel-area flags (no mask). + +For current/scenario AOHs: uses current/ era species data. +For pnv AOHs: uses historic/ era species data. +""" + input: + unpack(aoh_species_inputs), + output: + metadata=DATADIR / "aohs" / "{scenario}" / "{taxa}" / "aoh_{species_id}.json", + log: + DATADIR / "logs" / "aoh" / "{scenario}" / "{taxa}" / "aoh_{species_id}.log", + resources: + aoh_slots=1, + params: + habitat_dir=lambda wildcards: DATADIR / "habitat_layers" / wildcards.scenario, + output_dir=lambda wildcards: DATADIR + / "aohs" + / wildcards.scenario + / wildcards.taxa, + shell: + """ + mkdir -p $(dirname {log}) + mkdir -p {params.output_dir} + aoh-calc \ + --fractional_habitats {params.habitat_dir} \ + --elevation-max {input.elevation_max} \ + --elevation-min {input.elevation_min} \ + --crosswalk {input.crosswalk} \ + --speciesdata {input.species_data} \ + --output {params.output_dir} \ + --force-habitat \ + --pixel-area \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Per-Taxa AOH Aggregation (Checkpoint) +# ============================================================================= + + +rule aggregate_aohs_per_taxa: + """ +Checkpoint that ensures all AOHs for a taxa/scenario are generated. +Creates a sentinel file when complete. + +This is a checkpoint so downstream rules (delta P) can re-evaluate the DAG +after AOHs are created. 
+""" + input: + metadata=get_all_aoh_metadata_for_taxa_scenario, + version_sentinel=DATADIR / ".sentinels" / "aoh_version.txt", + output: + sentinel=DATADIR / "aohs" / "{scenario}" / "{taxa}" / ".complete", + shell: + """ + echo "Generated $(echo {input.metadata} | wc -w) AOHs for {wildcards.taxa}/{wildcards.scenario}" + touch {output.sentinel} + """ + + +# ============================================================================= +# Collate AOH Data (per scenario) +# ============================================================================= + + +rule collate_aoh_data: + """ +Collate metadata from all AOH JSON files for a scenario into a single CSV. + +Used by validation (current scenario) and for downstream analysis. +""" + input: + sentinels=lambda wildcards: expand( + str(DATADIR / "aohs" / "{scenario}" / "{taxa}" / ".complete"), + scenario=wildcards.scenario, + taxa=TAXA, + ), + version_sentinel=DATADIR / ".sentinels" / "aoh_version.txt", + output: + collated=DATADIR / "aohs" / "{scenario}.csv", + log: + DATADIR / "logs" / "collate_aoh_{scenario}.log", + params: + aoh_results_dir=lambda wildcards: DATADIR / "aohs" / wildcards.scenario, + shell: + """ + mkdir -p $(dirname {output.collated}) + aoh-collate-data \ + --aoh_results {params.aoh_results_dir} \ + --output {output.collated} \ + 2>&1 | tee {log} + """ diff --git a/workflow/rules/delta_p.smk b/workflow/rules/delta_p.smk new file mode 100644 index 0000000..9ee11dc --- /dev/null +++ b/workflow/rules/delta_p.smk @@ -0,0 +1,149 @@ +# LIFE Pipeline - Delta P (Persistence) Rules +# ============================================ +# +# Calculates per-species change in probability of persistence (Delta P) +# for each user-defined scenario, then aggregates to produce final maps. +# +# Pipeline per scenario: +# 1. calculate_delta_p: per species, uses current + scenario + pnv AOHs +# 2. aggregate_delta_p_per_taxa: sentinel that all species are done +# 3. 
raster_sum_per_taxa: sum per-species delta P values per taxa +# 4. species_totals: count species per taxa for normalisation +# 5. delta_p_scaled: final scaled output map + + +import os +from pathlib import Path +from types import SimpleNamespace + +# ============================================================================= +# Per-Species Delta P Calculation +# ============================================================================= + + +rule calculate_delta_p: + """ +Calculate the change in probability of persistence for a single species +under a given scenario. + +species_id wildcard is of the form T{taxon_id}A{assessment_id}_{SEASON}, +e.g. T22685505A261477056_RESIDENT. +""" + input: + current_sentinel=DATADIR / "aohs" / "current" / "{taxa}" / ".complete", + scenario_sentinel=DATADIR / "aohs" / "{scenario}" / "{taxa}" / ".complete", + pnv_sentinel=DATADIR / "aohs" / "pnv" / "{taxa}" / ".complete", + output: + sentinel=DATADIR + / "deltap" + / "{scenario}" + / CURVE + / "{taxa}" + / ".{species_id}.done", + log: + DATADIR / "logs" / "deltap" / "{scenario}" / "{taxa}" / "{species_id}.log", + params: + current_path=lambda wildcards: DATADIR / "aohs" / "current" / wildcards.taxa, + scenario_path=lambda wildcards: DATADIR + / "aohs" + / wildcards.scenario + / wildcards.taxa, + pnv_path=lambda wildcards: DATADIR / "aohs" / "pnv" / wildcards.taxa, + taxon_id=lambda wildcards: wildcards.species_id.rsplit("_", 1)[0], + season=lambda wildcards: wildcards.species_id.rsplit("_", 1)[1], + curve=CURVE, + output_tif=lambda wildcards: DATADIR + / "deltap" + / wildcards.scenario + / CURVE + / wildcards.taxa + / f"deltap_{wildcards.species_id}.tif", + script: + str(SRCDIR / "deltap" / "global_code_residents_pixel.py") + + +# ============================================================================= +# Per-Taxa Raster Sum +# ============================================================================= + + +rule raster_sum_per_taxa: + """ +Sum all per-species delta P 
rasters for a taxa into a single raster. +Implicitly waits for all calculate_delta_p jobs via their per-species .done sentinel files. +""" + input: + rasters=get_delta_p_sentinels_for_taxa_scenario, + output: + tif=DATADIR / "deltap_sum" / "{scenario}" / CURVE / "{taxa}.tif", + log: + DATADIR / "logs" / "raster_sum" / "{scenario}" / "{taxa}.log", + threads: workflow.cores + params: + rasters_dir=lambda wildcards: DATADIR + / "deltap" + / wildcards.scenario + / CURVE + / wildcards.taxa, + curve=CURVE, + script: + str(SRCDIR / "utils" / "raster_sum.py") + + +# ============================================================================= +# Species Totals +# ============================================================================= + + +def get_all_delta_p_tifs_for_scenario(wildcards): + tifs = [] + for taxa in TAXA: + mock = SimpleNamespace(taxa=taxa, scenario=wildcards.scenario) + tifs.extend(get_delta_p_sentinels_for_taxa_scenario(mock)) + return tifs + + +rule species_totals: + """ +Count the number of species per taxa used in the delta P calculation. +Used by delta_p_scaled for normalisation. +""" + input: + rasters=get_all_delta_p_tifs_for_scenario, + output: + totals=DATADIR / "deltap" / "{scenario}" / CURVE / "totals.csv", + log: + DATADIR / "logs" / "species_totals_{scenario}.log", + params: + deltaps_dir=lambda wildcards: DATADIR / "deltap" / wildcards.scenario / CURVE, + script: + str(SRCDIR / "utils" / "species_totals.py") + + +# ============================================================================= +# Final Scaled Delta P Map +# ============================================================================= + + +rule delta_p_scaled: + """ +Generate the final scaled delta P map for a scenario. + +Combines per-taxa delta P sums with the habitat difference map and +species totals to produce the final normalised LIFE output. 
+""" + input: + taxa_rasters=expand( + str(DATADIR / "deltap_sum" / "{{scenario}}" / CURVE / "{taxa}.tif"), + taxa=TAXA, + ), + diffmap=DATADIR / "habitat" / "{scenario}_diff_area.tif", + totals=DATADIR / "deltap" / "{scenario}" / CURVE / "totals.csv", + output: + final=DATADIR / "deltap_final" / f"scaled_{{scenario}}_{CURVE}.tif", + log: + DATADIR / "logs" / "delta_p_scaled_{scenario}.log", + params: + input_dir=lambda wildcards: DATADIR / "deltap_sum" / wildcards.scenario / CURVE, + script: + str(SRCDIR / "deltap" / "delta_p_scaled.py") diff --git a/workflow/rules/habitat.smk b/workflow/rules/habitat.smk new file mode 100644 index 0000000..d01540a --- /dev/null +++ b/workflow/rules/habitat.smk @@ -0,0 +1,332 @@ +# LIFE Pipeline - Habitat layer rules +# ===================================== +# +# Handles the two-phase habitat map generation: +# +# Phase 1 (generate_food_map.sh equivalent): +# - Download GAEZ and HYDE data +# - Combine GAEZ/HYDE into crop/pasture fractional layers +# - Build Jung current map at 100m +# - Rescale PNV to 100m +# - Build food-enhanced current map at 100m (PRECIOUS) +# +# Phase 2 (run.sh equivalent): +# - Warp current 100m → habitat_layers/current/ (PRECIOUS) +# - Process PNV → habitat_layers/pnv/ via aoh-habitat-process (PRECIOUS) + + +import os +from pathlib import Path + +# ============================================================================= +# GAEZ data +# ============================================================================= + + +rule gaez_download: + """ +Fetch the compressed GAEZ data. 
+""" + output: + archive=DATADIR / "food" / "LR.zip", + log: + DATADIR / "logs" / "download_gaez.log", + params: + url="https://s3.eu-west-1.amazonaws.com/data.gaezdev.aws.fao.org/LR.zip", + shell: + """ + mkdir -p $(dirname {output.archive}) + curl -o {output.archive} \ + {params.url} \ + 2>&1 | tee {log} + """ + + +rule gaez_expand: + """Decompress the GAEZ download.""" + input: + archive=DATADIR / "food" / "LR.zip", + output: + raster=DATADIR / "food" / "GLCSv11_02_5m.tif", + log: + DATADIR / "logs" / "expand_gaez.log", + params: + filepath="LR/lco/GLCSv11_02_5m.tif", + shell: + """ + unzip -j {input.archive} \ + {params.filepath} \ + -d $(dirname {output.raster}) \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Hyde data +# ============================================================================= + + +rule hyde_download: + """Fetch the compressed HYDE data.""" + output: + archive=DATADIR / "food" / "baseline.zip", + log: + DATADIR / "logs" / "download_hyde.log", + params: + url="https://geo.public.data.uu.nl/vault-hyde/HYDE%203.2%5B1710494848%5D/original/baseline.zip", + shell: + """ + mkdir -p $(dirname {output.archive}) + curl -o {output.archive} \ + {params.url} \ + 2>&1 | tee {log} + """ + + +rule hyde_expand_land_usage_archive: + """Extract the inner land usage archive from HYDE.""" + input: + archive=DATADIR / "food" / "baseline.zip", + output: + inner_hyde=DATADIR / "food" / "2017AD_lu.zip", + log: + DATADIR / "logs" / "expand_hyde_1.log", + params: + filepath="baseline/zip/2017AD_lu.zip", + shell: + """ + unzip -j {input.archive} \ + {params.filepath} \ + -d $(dirname {output.inner_hyde}) \ + 2>&1 | tee {log} + """ + + +rule hyde_expand_land_usage_raster: + """Extract raster from the inner HYDE land usage archive.""" + input: + archive=DATADIR / "food" / "2017AD_lu.zip", + output: + raw_hyde_raster=DATADIR / "food" / "grazing2017AD.asc", + log: + DATADIR / "logs" / 
"expand_hyde_2.log", + params: + filepath="grazing2017AD.asc", + shell: + """ + unzip -j {input.archive} \ + {params.filepath} \ + -d $(dirname {output.raw_hyde_raster}) \ + 2>&1 | tee {log} + """ + + +rule modify_hyde_pixel_scale: + """ +Fix the pixel scale value in the HYDE data — the rounding is not precise +enough to align with the GAEZ data. +""" + input: + raw_hyde_raster=DATADIR / "food" / "grazing2017AD.asc", + output: + modified_hyde_raster=DATADIR / "food" / "modified_grazing2017AD.asc", + shell: + """ + sed "s/0.0833333/0.08333333333333333/" \ + {input.raw_hyde_raster} > {output.modified_hyde_raster} + """ + + +rule add_hyde_projection_info: + """ +The HYDE data ships without a projection, so add one so that +the rest of the workflow can mix it with projected raster data. +""" + output: + hyde_projection_file=DATADIR / "food" / "modified_grazing2017AD.prj", + params: + projection=config["hyde_projection"], + shell: + """ + echo '{params.projection}' > {output.hyde_projection_file} + """ + + +# ============================================================================= +# Combine GAEZ/Hyde data +# ============================================================================= +rule combine_gaez_hyde: + """ +Combine the GAEZ and Hyde data, adjusting for overflow in cells. +""" + input: + hyde_projection_file=DATADIR / "food" / "modified_grazing2017AD.prj", + hyde_raster=DATADIR / "food" / "modified_grazing2017AD.asc", + gaez_raster=DATADIR / "food" / "GLCSv11_02_5m.tif", + output: + crop=DATADIR / "food" / "crop.tif", + pasture=DATADIR / "food" / "pasture.tif", + params: + output_dir=DATADIR / "food", + script: + str(SRCDIR / "prepare_layers" / "build_gaez_hyde.py") + + +# ============================================================================= +# PNV rescaled to 100m +# ============================================================================= + + +rule pnv_100m: + """ +Rescale the PNV map to 100m resolution. 
+Yirgacheffe can rescale dynamically, but pre-scaling is faster at the cost +of some extra disk space. +""" + input: + pnv=DATADIR / "habitat" / "pnv_raw.tif", + output: + pnv_100m=DATADIR / "100m" / "pnv.tif", + log: + DATADIR / "logs" / "pnv_100m.log", + shell: + """ + gdalwarp \ + -t_srs EPSG:4326 \ + -tr 0.000898315284120 -0.000898315284120 \ + -r near \ + -tap \ + -multi -wo NUM_THREADS=ALL_CPUS \ + -co COMPRESS=LZW -co NUM_THREADS=ALL_CPUS \ + {input.pnv} \ + {output.pnv_100m} \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Food-enhanced current map at 100m (PRECIOUS) +# ============================================================================= + + +rule current_raws: + """ +Build the LIFE current map, which is Jung with updates applied +and restricted to L1 to match the PNV map restrictions. +""" + input: + updates_sentinel=DATADIR / "habitat" / ".downloaded_updates", + habitat=DATADIR / "100m" / "jung_l2_raw.tif", + crosswalk=DATADIR / "crosswalk.csv", + output: + sentinel=DATADIR / "100m" / "jung_current" / ".sentinel", + threads: workflow.cores + params: + updates_dir=DATADIR / "habitat" / "lvl2_changemasks_ver004", + output_dir=DATADIR / "100m" / "jung_current", + script: + str(SRCDIR / "prepare_layers" / "make_current_map.py") + + +rule build_food_map: + """ +Build the food-enhanced current habitat map at 100m resolution by combining +the Jung current map with GAEZ/HYDE crop and pasture fractions. + +PRECIOUS: Only rebuilds if the sentinel is explicitly deleted. +This is the most expensive step in the pipeline. 
+""" + input: + jung=ancient(DATADIR / "100m" / "jung_current" / ".sentinel"), + pnv=ancient(DATADIR / "100m" / "pnv.tif"), + crop=ancient(DATADIR / "food" / "crop.tif"), + pasture=ancient(DATADIR / "food" / "pasture.tif"), + output: + sentinel=DATADIR / "100m" / "current" / ".sentinel", + log: + DATADIR / "logs" / "build_food_map.log", + threads: workflow.cores + params: + jung_dir=DATADIR / "100m" / "jung_current", + output_dir=DATADIR / "100m" / "current", + script: + str(SRCDIR / "prepare_layers" / "make_food_current_map.py") + + +# ============================================================================= +# Warp current map to habitat_layers resolution (PRECIOUS) +# ============================================================================= + + +rule warp_current: + """ +Warp the food-enhanced current map from 100m to the target pixel scale +(5 arc-seconds, ~1.67km at the equator). + +PRECIOUS: Only rebuilds if the sentinel is explicitly deleted. +""" + input: + sentinel=ancient(DATADIR / "100m" / "current" / ".sentinel"), + output: + sentinel=DATADIR / "habitat_layers" / "current" / ".sentinel", + log: + DATADIR / "logs" / "warp_current.log", + threads: workflow.cores + params: + input_dir=DATADIR / "100m" / "current", + output_dir=DATADIR / "habitat_layers" / "current", + pixel_scale=config["pixel_scale"], + shell: + """ + mkdir -p {params.output_dir} + for d in {params.input_dir}/*.tif; do + basename=$(basename "$d") + gdalwarp \ + -t_srs EPSG:4326 \ + -tr {params.pixel_scale} -{params.pixel_scale} \ + -r average \ + -multi \ + -co COMPRESS=LZW \ + -co NUM_THREADS={threads} \ + -wo NUM_THREADS={threads} \ + "$d" \ + {params.output_dir}/"$basename" \ + 2>&1 + done 2>&1 | tee {log} + touch {output.sentinel} + """ + + +# ============================================================================= +# Process PNV map to habitat_layers (PRECIOUS) +# ============================================================================= + + +rule pnv_processed: + """ 
+Process the PNV map into per-class fractional rasters at the target pixel scale +using aoh-habitat-process. + +PRECIOUS: Only rebuilds if the sentinel is explicitly deleted. +""" + input: + pnv=ancient(DATADIR / "habitat" / "pnv_raw.tif"), + output: + sentinel=DATADIR / "habitat_layers" / "pnv" / ".sentinel", + log: + DATADIR / "logs" / "pnv_processed.log", + params: + output_dir=DATADIR / "habitat_layers" / "pnv", + pixel_scale=config["pixel_scale"], + projection=config["projection"], + shell: + """ + set -e + aoh-habitat-process \ + --habitat {input.pnv} \ + --scale {params.pixel_scale} \ + --projection {params.projection} \ + --output {params.output_dir} \ + 2>&1 | tee {log} + touch {output.sentinel} + """ diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk new file mode 100644 index 0000000..e69be79 --- /dev/null +++ b/workflow/rules/prepare.smk @@ -0,0 +1,185 @@ +# LIFE Pipeline - Prepare base layers +# ===================================== +# +# Handles: +# - Crosswalk table generation +# - Jung L2 habitat map download +# - Jung habitat updates download +# - Jung PNV map download +# - Elevation download and warp (precious) + +import os +from pathlib import Path + +# ============================================================================= +# Crosswalk Table +# ============================================================================= + + +rule convert_crosswalk: + """ +Generate a crosswalk between IUCN Habitat classes and Jung raster pixel values. +""" + output: + crosswalk=DATADIR / "crosswalk.csv", + script: + str(SRCDIR / "prepare_layers" / "generate_crosswalk.py") + + +# ============================================================================= +# Jung L2 habitat map +# ============================================================================= + + +rule jung_habitat_map: + """ +Fetch the Jung L2 habitat map from Zenodo. 
+""" + output: + habitat=DATADIR / "100m" / "jung_l2_raw.tif", + log: + DATADIR / "logs" / "download_habitat.log", + params: + zenodo_id=config["zenodo"]["jung_habitat"]["zenodo_id"], + filename=config["zenodo"]["jung_habitat"]["filename"], + shell: + """ + mkdir -p $(dirname {output.habitat}) + reclaimer zenodo --zenodo_id {params.zenodo_id} \ + --filename "{params.filename}" \ + --extract \ + --output {output.habitat} \ + 2>&1 | tee {log} + """ + + +rule jung_habitat_updates: + """ +Fetch the Jung habitat map update masks from Zenodo. +""" + output: + sentinel=DATADIR / "habitat" / ".downloaded_updates", + log: + DATADIR / "logs" / "download_habitat_updates.log", + params: + habitat_dir=DATADIR / "habitat", + zenodo_id=config["zenodo"]["jung_habitat_updates"]["zenodo_id"], + filename=config["zenodo"]["jung_habitat_updates"]["filename"], + shell: + """ + mkdir -p {params.habitat_dir} + reclaimer zenodo --zenodo_id {params.zenodo_id} \ + --filename "{params.filename}" \ + --extract \ + --output {params.habitat_dir} \ + 2>&1 | tee {log} + touch {output.sentinel} + """ + + +# ============================================================================= +# Jung PNV map +# ============================================================================= + + +rule jung_pnv_map: + """ +Fetch the Jung PNV map from Zenodo. 
+""" + output: + habitat=DATADIR / "habitat" / "pnv_raw.tif", + log: + DATADIR / "logs" / "download_pnv.log", + params: + zenodo_id=config["zenodo"]["jung_pnv"]["zenodo_id"], + filename=config["zenodo"]["jung_pnv"]["filename"], + shell: + """ + mkdir -p $(dirname {output.habitat}) + reclaimer zenodo --zenodo_id {params.zenodo_id} \ + --filename "{params.filename}" \ + --extract \ + --output {output.habitat} \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Elevation layers (precious — only rebuild if explicitly deleted) +# ============================================================================= + + +rule download_elevation: + """ +Download raw elevation DEM from Zenodo. +""" + output: + elevation=DATADIR / "elevation.tif", + log: + DATADIR / "logs" / "download_elevation.log", + params: + zenodo_id=config["zenodo"]["elevation"]["zenodo_id"], + filename=config["zenodo"]["elevation"]["filename"], + shell: + """ + reclaimer zenodo --zenodo_id {params.zenodo_id} \ + --filename "{params.filename}" \ + --output {output.elevation} \ + 2>&1 | tee {log} + """ + + +rule elevation_max: + """ +Warp elevation to target projection and scale using max resampling. +Precious: only runs if output doesn't exist. +""" + input: + elevation=ancient(DATADIR / "elevation.tif"), + output: + elevation_max=DATADIR / "elevation-max.tif", + log: + DATADIR / "logs" / "elevation_max.log", + threads: workflow.cores + params: + pixel_scale=config["pixel_scale"], + shell: + """ + gdalwarp \ + -t_srs EPSG:4326 \ + -tr {params.pixel_scale} -{params.pixel_scale} \ + -r max \ + -co COMPRESS=LZW \ + -wo NUM_THREADS={threads} \ + {input.elevation} \ + {output.elevation_max} \ + 2>&1 | tee {log} + """ + + +rule elevation_min: + """ +Warp elevation to target projection and scale using min resampling. +Precious: only runs if output doesn't exist. 
+""" + input: + elevation=ancient(DATADIR / "elevation.tif"), + output: + elevation_min=DATADIR / "elevation-min.tif", + log: + DATADIR / "logs" / "elevation_min.log", + threads: workflow.cores + params: + pixel_scale=config["pixel_scale"], + shell: + """ + gdalwarp \ + -t_srs EPSG:4326 \ + -tr {params.pixel_scale} -{params.pixel_scale} \ + -r min \ + -co COMPRESS=LZW \ + -wo NUM_THREADS={threads} \ + {input.elevation} \ + {output.elevation_min} \ + 2>&1 | tee {log} + """ diff --git a/workflow/rules/scenario.smk b/workflow/rules/scenario.smk new file mode 100644 index 0000000..efd92b9 --- /dev/null +++ b/workflow/rules/scenario.smk @@ -0,0 +1,210 @@ +# LIFE Pipeline - Scenario map rules +# ===================================== +# +# Handles generating scenario-specific habitat maps and difference maps: +# +# - Arable: all non-urban land converted to arable +# - Restore: agricultural land restored to PNV +# +# Each scenario: +# 1. Generate 100m scenario map from current map +# 2. Warp to habitat_layers/{scenario}/ at target pixel scale (PRECIOUS) +# 3. Generate diff map comparing current vs scenario + + +import os +from pathlib import Path + +# ============================================================================= +# Arable scenario +# ============================================================================= + + +rule make_arable_map: + """ +Generate the arable scenario map at 100m resolution. +All non-urban land is converted to arable. 
+""" + input: + current_sentinel=DATADIR / "100m" / "current" / ".sentinel", + output: + sentinel=DATADIR / "100m" / "arable" / ".sentinel", + log: + DATADIR / "logs" / "make_arable_map.log", + threads: workflow.cores + params: + current_dir=DATADIR / "100m" / "current", + output_dir=DATADIR / "100m" / "arable", + shell: + """ + python3 {SRCDIR}/prepare_layers/make_arable_map.py \ + --current {params.current_dir} \ + --output {params.output_dir} \ + -j 2 \ + 2>&1 | tee {log} + touch {output.sentinel} + """ + + +rule warp_arable: + """ +Warp the arable scenario map from 100m to the target pixel scale. +PRECIOUS: Only rebuilds if the sentinel is explicitly deleted. +""" + input: + sentinel=ancient(DATADIR / "100m" / "arable" / ".sentinel"), + output: + sentinel=DATADIR / "habitat_layers" / "arable" / ".sentinel", + log: + DATADIR / "logs" / "warp_arable.log", + threads: workflow.cores + params: + input_dir=DATADIR / "100m" / "arable", + output_dir=DATADIR / "habitat_layers" / "arable", + pixel_scale=config["pixel_scale"], + shell: + """ + mkdir -p {params.output_dir} + for d in {params.input_dir}/*.tif; do + basename=$(basename "$d") + gdalwarp \ + -t_srs EPSG:4326 \ + -tr {params.pixel_scale} -{params.pixel_scale} \ + -r average \ + -multi \ + -co COMPRESS=LZW \ + -co NUM_THREADS={threads} \ + -wo NUM_THREADS={threads} \ + "$d" \ + {params.output_dir}/"$basename" \ + 2>&1 + done 2>&1 | tee {log} + touch {output.sentinel} + """ + + +rule diff_map_arable: + """ +Generate the area difference map between current and arable habitat layers. +Used by the delta P scaling step. 
+""" + input: + current_sentinel=DATADIR / "habitat_layers" / "current" / ".sentinel", + arable_sentinel=DATADIR / "habitat_layers" / "arable" / ".sentinel", + output: + DATADIR / "habitat" / "arable_diff_area.tif", + log: + DATADIR / "logs" / "diff_map_arable.log", + threads: workflow.cores + params: + current_dir=DATADIR / "habitat_layers" / "current", + arable_dir=DATADIR / "habitat_layers" / "arable", + shell: + """ + mkdir -p $(dirname {output}) + python3 {SRCDIR}/prepare_layers/make_diff_map.py \ + --current {params.current_dir} \ + --scenario {params.arable_dir} \ + --output {output} \ + -j {threads} \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Restore scenario +# ============================================================================= + + +rule make_restore_map: + """ +Generate the restore scenario map at 100m resolution. +Agricultural land is restored to PNV habitat. +""" + input: + current_sentinel=DATADIR / "100m" / "current" / ".sentinel", + pnv=DATADIR / "habitat" / "pnv_raw.tif", + crosswalk=DATADIR / "crosswalk.csv", + output: + sentinel=DATADIR / "100m" / "restore" / ".sentinel", + log: + DATADIR / "logs" / "make_restore_map.log", + threads: workflow.cores + params: + current_dir=DATADIR / "100m" / "current", + output_dir=DATADIR / "100m" / "restore", + shell: + """ + python3 {SRCDIR}/prepare_layers/make_restore_map.py \ + --pnv {input.pnv} \ + --current {params.current_dir} \ + --crosswalk {input.crosswalk} \ + --output {params.output_dir} \ + -j 2 \ + 2>&1 | tee {log} + touch {output.sentinel} + """ + + +rule warp_restore: + """ +Warp the restore scenario map from 100m to the target pixel scale. +PRECIOUS: Only rebuilds if the sentinel is explicitly deleted. 
+""" + input: + sentinel=ancient(DATADIR / "100m" / "restore" / ".sentinel"), + output: + sentinel=DATADIR / "habitat_layers" / "restore" / ".sentinel", + log: + DATADIR / "logs" / "warp_restore.log", + threads: workflow.cores + params: + input_dir=DATADIR / "100m" / "restore", + output_dir=DATADIR / "habitat_layers" / "restore", + pixel_scale=config["pixel_scale"], + shell: + """ + mkdir -p {params.output_dir} + for d in {params.input_dir}/*.tif; do + basename=$(basename "$d") + gdalwarp \ + -t_srs EPSG:4326 \ + -tr {params.pixel_scale} -{params.pixel_scale} \ + -r average \ + -multi \ + -co COMPRESS=LZW \ + -co NUM_THREADS={threads} \ + -wo NUM_THREADS={threads} \ + "$d" \ + {params.output_dir}/"$basename" \ + 2>&1 + done 2>&1 | tee {log} + touch {output.sentinel} + """ + + +rule diff_map_restore: + """ +Generate the area difference map between current and restore habitat layers. +""" + input: + current_sentinel=DATADIR / "habitat_layers" / "current" / ".sentinel", + restore_sentinel=DATADIR / "habitat_layers" / "restore" / ".sentinel", + output: + DATADIR / "habitat" / "restore_diff_area.tif", + log: + DATADIR / "logs" / "diff_map_restore.log", + threads: workflow.cores + params: + current_dir=DATADIR / "habitat_layers" / "current", + restore_dir=DATADIR / "habitat_layers" / "restore", + shell: + """ + mkdir -p $(dirname {output}) + python3 {SRCDIR}/prepare_layers/make_diff_map.py \ + --current {params.current_dir} \ + --scenario {params.restore_dir} \ + --output {output} \ + -j {threads} \ + 2>&1 | tee {log} + """ diff --git a/workflow/rules/species.smk b/workflow/rules/species.smk new file mode 100644 index 0000000..db9aa27 --- /dev/null +++ b/workflow/rules/species.smk @@ -0,0 +1,54 @@ +# LIFE Pipeline - Species Data Extraction Rules +# ============================================== +# +# Extracts species data from the IUCN PostgreSQL database. 
+# Creates both current/ (presence codes 1,2) and historic/ (presence codes 1,2,4,5)
+# subdirectories, each with per-species GeoJSON files and a report.csv.
+#
+# Environment variables required:
+#   DB_HOST, DB_NAME, DB_USER, DB_PASSWORD
+
+
+# =============================================================================
+# Species Data Extraction (Checkpoint)
+# =============================================================================
+
+
+checkpoint extract_species_data:
+    """
+Extract species data from PostgreSQL database.
+
+This is a checkpoint because the number of output GeoJSON files is only
+known after extraction. Each taxa produces N species files in both current/
+and historic/ subdirectories.
+
+Environment variables required:
+    DB_HOST, DB_NAME, DB_USER, DB_PASSWORD
+"""
+    output:
+        current_report=DATADIR / "species-info" / "{taxa}" / "current" / "report.csv",
+        historic_report=DATADIR / "species-info" / "{taxa}" / "historic" / "report.csv",
+    log:
+        DATADIR / "logs" / "extract_species_{taxa}.log",
+    threads: workflow.cores
+    resources:
+        db_connections=1,  # serialises DB access: at most one extraction talks to PostgreSQL at a time (given --resources db_connections=1)
+    params:
+        classname="{taxa}",  # NOTE(review): appears unused — the shell reads wildcards.taxa directly; confirm and drop
+        output_dir=lambda wildcards: DATADIR / "species-info" / wildcards.taxa,
+        projection=config["projection"],
+        overrides=lambda wildcards: DATADIR
+        / config["optional_inputs"]["species_overrides"],
+    shell:
+        """
+        OVERRIDES_ARG=""
+        if [ -f "{params.overrides}" ]; then
+            OVERRIDES_ARG="--overrides {params.overrides}"
+        fi  # $OVERRIDES_ARG is left unquoted below so an empty value expands to no argument at all
+        python3 {SRCDIR}/prepare_species/extract_species_psql.py \
+            --class {wildcards.taxa} \
+            --output {params.output_dir} \
+            --projection "{params.projection}" \
+            $OVERRIDES_ARG \
+            2>&1 | tee {log}
+        """
diff --git a/workflow/rules/validation.smk b/workflow/rules/validation.smk
new file mode 100644
index 0000000..48e20ec
--- /dev/null
+++ b/workflow/rules/validation.smk
@@ -0,0 +1,163 @@
+# LIFE Pipeline - Validation Rules
+# =================================
+#
+# Validates AOH models for the "current" scenario only:
+#
+# 1. 
Model validation: Statistical analysis (requires R)
+# 2. GBIF occurrence validation: Expensive, explicit-only
+
+# =============================================================================
+# Model Validation
+# =============================================================================
+
+
+rule model_validation:
+    """
+Perform statistical validation of AOH models (Dahal et al. methodology).
+Runs on the "current" scenario AOHs only.
+"""
+    input:
+        collated=DATADIR / "aohs" / "current.csv",  # collated per-species AOH CSV doubles as the completion marker for the current AOHs
+        version_sentinel=DATADIR / ".sentinels" / "aoh_version.txt",  # forces a re-run when the aoh tooling version changes
+    output:
+        validation=DATADIR / "validation" / "model_validation.csv",
+    log:
+        DATADIR / "logs" / "model_validation.log",
+    shell:  # NOTE(review): no threads directive, so Snakemake schedules this with the default of 1 core — confirm that is intended
+        """
+        mkdir -p $(dirname {output.validation})
+        aoh-validate-prevalence \
+            --collated_aoh_data {input.collated} \
+            --output {output.validation} \
+            2>&1 | tee {log}
+        """
+
+
+# =============================================================================
+# Species Richness and Endemism (current scenario)
+# =============================================================================
+
+
+rule species_richness:
+    """
+Calculate species richness from current AOH rasters.
+NOT included in 'all' — use the 'summaries' target explicitly.
+"""
+    input:
+        aoh_sentinel=DATADIR / "aohs" / "current.csv",  # collated CSV acts as the completion marker for the current AOH rasters
+        version_sentinel=DATADIR / ".sentinels" / "aoh_version.txt",
+    output:
+        richness=DATADIR / "summaries" / "species_richness.tif",
+    log:
+        DATADIR / "logs" / "species_richness.log",
+    threads: workflow.cores
+    params:
+        aohs_folder=DATADIR / "aohs" / "current",
+    shell:
+        """
+        mkdir -p $(dirname {output.richness})
+        aoh-species-richness \
+            --aohs_folder {params.aohs_folder} \
+            --output {output.richness} \
+            2>&1 | tee {log}
+        """
+
+
+rule endemism:
+    """
+Calculate endemism from current AOH rasters.
+NOT included in 'all' — use the 'summaries' target explicitly.
+""" + input: + aoh_sentinel=DATADIR / "aohs" / "current.csv", + species_richness=DATADIR / "summaries" / "species_richness.tif", + version_sentinel=DATADIR / ".sentinels" / "aoh_version.txt", + output: + endemism=DATADIR / "summaries" / "endemism.tif", + log: + DATADIR / "logs" / "endemism.log", + threads: workflow.cores + params: + aohs_folder=DATADIR / "aohs" / "current", + shell: + """ + aoh-endemism \ + --aohs_folder {params.aohs_folder} \ + --species_richness {input.species_richness} \ + --output {output.endemism} \ + 2>&1 | tee {log} + """ + + +# ============================================================================= +# Occurrence Validation (EXPENSIVE!) +# ============================================================================= + + +rule fetch_gbif_data: + """ +Fetch GBIF occurrence data for a taxa. +Expensive (hours) — only runs if output doesn't exist. + +Environment variables required: + GBIF_USERNAME, GBIF_EMAIL, GBIF_PASSWORD +""" + input: + collated=ancient(DATADIR / "aohs" / "current.csv"), + output: + sentinel=DATADIR / "validation" / "occurrences" / ".{taxa}_fetched", + log: + DATADIR / "logs" / "fetch_gbif_{taxa}.log", + params: + output_dir=DATADIR / "validation" / "occurrences", + shell: + """ + mkdir -p {params.output_dir} + aoh-fetch-gbif-data \ + --collated_aoh_data {input.collated} \ + --taxa {wildcards.taxa} \ + --output_dir {params.output_dir} \ + 2>&1 | tee {log} + touch {output.sentinel} + """ + + +rule validate_gbif_occurrences: + """ +Validate current AOH models against GBIF occurrence data. 
+""" + input: + gbif_sentinel=DATADIR / "validation" / "occurrences" / ".{taxa}_fetched", + aoh_sentinel=ancient(DATADIR / "aohs" / "current" / "{taxa}" / ".complete"), + output: + validation=DATADIR / "validation" / "occurrences" / "{taxa}.csv", + log: + DATADIR / "logs" / "validate_gbif_{taxa}.log", + params: + gbif_data=lambda wildcards: DATADIR + / "validation" + / "occurrences" + / wildcards.taxa, + species_data=lambda wildcards: DATADIR + / "species-info" + / wildcards.taxa + / "current", + aoh_results=lambda wildcards: DATADIR / "aohs" / "current" / wildcards.taxa, + shell: + """ + aoh-validate-occurrences \ + --gbif_data_path {params.gbif_data} \ + --species_data {params.species_data} \ + --aoh_results {params.aoh_results} \ + --output {output.validation} \ + 2>&1 | tee {log} + """ + + +rule occurrence_validation: + """ +Target rule for GBIF validation for all taxa. +WARNING: Expensive. Only run explicitly: snakemake occurrence_validation +""" + input: + expand(str(DATADIR / "validation" / "occurrences" / "{taxa}.csv"), taxa=TAXA),