diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8fd17a0..5724e1c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ on: jobs: build: runs-on: ubuntu-latest - container: ghcr.io/osgeo/gdal:ubuntu-small-3.10.3 + container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.4 strategy: fail-fast: false matrix: @@ -22,22 +22,31 @@ jobs: - name: Install system run: | apt-get update -qqy - apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev + apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev shellcheck - uses: actions/checkout@v4 with: submodules: 'true' + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install gdal[numpy]==3.10.3 + python -m pip install gdal[numpy]==3.11.4 python -m pip install -r requirements.txt + - name: Lint with pylint - run: | - python3 -m pylint utils prepare_layers prepare_species threats + run: python3 -m pylint utils prepare_layers prepare_species threats + + - name: Type checking with mypy + run: python3 -m mypy utils prepare_layers prepare_species threats + - name: Tests + run: python3 -m pytest ./tests + + - name: Script checks run: | - python3 -m pytest ./tests + shellcheck ./scripts/run.sh diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 30fd8a6..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "aoh-calculator"] - path = aoh-calculator - url = git@github.com:quantifyearth/aoh-calculator.git diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 0000000..d8ac83e --- /dev/null +++ b/.mypy.ini @@ -0,0 +1,4 @@ +[mypy] +ignore_missing_imports = True +explicit_package_bases = False +no_namespace_packages = True diff --git a/Dockerfile b/Dockerfile index ade1030..913f289 100644 --- a/Dockerfile 
+++ b/Dockerfile @@ -10,13 +10,14 @@ WORKDIR /go/littlejohn RUN go mod tidy RUN go build -FROM ghcr.io/osgeo/gdal:ubuntu-small-3.10.0 +FROM ghcr.io/osgeo/gdal:ubuntu-small-3.11.4 RUN apt-get update -qqy && \ apt-get install -qy \ git \ cmake \ python3-pip \ + shellcheck \ r-base \ libpq-dev \ libtirpc-dev \ @@ -27,7 +28,7 @@ COPY --from=reclaimerbuild /go/reclaimer/reclaimer /bin/reclaimer COPY --from=littlejohnbuild /go/littlejohn/littlejohn /bin/littlejohn RUN rm /usr/lib/python3.*/EXTERNALLY-MANAGED -RUN pip install gdal[numpy]==3.10.0 +RUN pip install gdal[numpy]==3.11.4 COPY requirements.txt /tmp/ RUN pip install -r /tmp/requirements.txt @@ -53,3 +54,5 @@ ENV PYTHONPATH=/root/star RUN python3 -m pytest ./tests RUN python3 -m pylint prepare_layers prepare_species utils tests +RUN python3 -m mypy prepare_layers prepare_species utils tests +RUN shellcheck ./scripts/run.sh diff --git a/README.md b/README.md index 93895b4..bc552b6 100644 --- a/README.md +++ b/README.md @@ -4,40 +4,41 @@ An implementation of the threat based [STAR biodiversity metric by Muir et al](h See [method.md](method.md) for a description of the methodology, or `scripts/run.sh` for how to execute the pipeline. -# Running the pipeline - ## Checking out the code -This repository uses submodules, so once you have cloned it, you need to fetch the submodules: +The code is available on github, and can be checked out from there: ```shell -$ git clone https://github.com/quantifyearth/star.git -$ cd star -$ git submodule update --init --recursive +$ git clone https://github.com/quantifyearth/STAR.git +... +$ cd STAR ``` -## Running the pipeline +## Additional inputs + +There are some additional inputs required to run the pipeline, which should be placed in the directory you use to store the pipeline results. -The easiest way to get started will be to run `scripts/run.sh` under a linux environment. 
+* SpeciesList_generalisedRangePolygons.csv - A list of species with generalised ranges on the IUCN Redlist. +* BL_Species_Elevations_2023.csv (optional) - Corrections to the elevation of BirdLife species on the IUCN Redlist taken from the BirdLife data. -### Running on Ubuntu +The script also assumes you have a Postgres database containing the IUCN Redlist data. + +## Running the pipeline -The following extra utilities will need to be installed: +There are two ways to run the pipeline. The easiest way is to use Docker if you have it available to you, as it will manage all the dependencies for you. You can also check out the code and run it locally, but that requires a little more effort. -* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a utility for downloading data from various primary sources. -* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a utility to run jobs in parallel driven by a CSV file. +### Running with Docker -### Running in Docker There is included a docker file, which is based on the GDAL container image, which is set up to install everything ready to use. You can build that using: -``` +```shell $ docker buildx build -t star . ``` You can then invoke the run script using this. 
You should map an external folder into the container as a place to store the intermediary data and final results, and you should provide details about the Postgres instance with the IUCN redlist: -``` +```shell $ docker run --rm -v /some/local/dir:/data \ -e DB_HOST=localhost \ -e DB_NAME=iucnredlist \ @@ -46,6 +47,66 @@ $ docker run --rm -v /some/local/dir:/data \ star ./scripts/run.sh ``` +### Running without Docker + +If you prefer not to use Docker, you will need: + +* Python3 >= 3.10 +* GDAL +* R (required for validation) +* [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a Go tool for fetching data from Zenodo +* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a Go tool for running scripts in parallel + +If you are using macOS, please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)). + +With those installed, you should set up a Python virtual environment to install all the required packages. The one trick is that you need to match the Python GDAL package to your installed GDAL version. For example, on my machine I did the following: + +```shell +$ python3 -m venv ./venv +$ . ./venv/bin/activate +(venv) $ gdalinfo --version +GDAL 3.11.3 "Eganville", released 2025/07/12 +(venv) $ pip install gdal[numpy]==3.11.3 +... +(venv) $ pip install -r requirements.txt +``` + +You will also need to install the R stats packages required for the validation stage: + +```shell +$ R -e "install.packages(c('lme4', 'lmerTest'), repos='https://cran.rstudio.com/')" +``` + +Before running the pipeline you will need to set several environment variables to tell the script where to store data and where the database with the IUCN Redlist is. You can set these manually, but we recommend using a tool like [direnv](https://direnv.net). 
+ +```shell +export DATADIR=[PATH WHERE YOU WANT THE RESULTS] +export DB_HOST=localhost +export DB_NAME=iucnredlist +export DB_PASSWORD=supersecretpassword +export DB_USER=postgres +``` + +Once all of that is in place, you can run the pipeline: + +```shell +(venv) $ ./scripts/run.sh +``` + # Credits The author of this package is greatly indebted to both [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) from the University of Newcastle and [Simon Tarr](https://www.linkedin.com/in/simon-tarr-22069b209/) of the IUCN for their guidance and review. + +## Data Attribution + +The crosswalk table `data/crosswalk_bin_T.csv` was created by [Francesca Ridley](https://www.ncl.ac.uk/nes/people/profile/francescaridley.html) and is derived from: + +``` +Lumbierres, M., Dahal, P.R., Di Marco, M., Butchart, S.H.M., Donald, P.F., +& Rondinini, C. (2022). Translating habitat class to land cover to map area +of habitat of terrestrial vertebrates. Conservation Biology, 36, e13851. +https://doi.org/10.1111/cobi.13851 +``` + +The paper is licensed under CC BY-NC. The crosswalk table is used in this STAR implementation to crosswalk between the IUCN Habitat classes in the Redlist and the land classes in the Copernicus data layers. 
+ diff --git a/aoh-calculator b/aoh-calculator deleted file mode 160000 index c24def9..0000000 --- a/aoh-calculator +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c24def960799f170a9812af31d4c0e2dc5940dbf diff --git a/data/crosswalk_bin_T.csv b/data/crosswalk_bin_T.csv new file mode 100644 index 0000000..f068cf8 --- /dev/null +++ b/data/crosswalk_bin_T.csv @@ -0,0 +1,18 @@ +CGLS100_name,CGLS100_value,Label,H_1,H_2,H_3,H_4,H_5,H_6,H_7,H_8,H_14.1,H_14.2,H_14.3,H_14.6,H_14.4,H_14.5,H_15 +CLS_20_shrubs,20,shrubs,0,1,1,0,0,0,U,1,0,0,0,0,0,0,0 +CLS_30_Herbaceous_vegetation,30,Herbaceous_vegetation,0,0,0,1,0,0,U,0,0,0,0,0,0,0,0 +CLS_40_CultivatedandManaged_VegetationAgriculture,40,CultivatedandManaged_VegetationAgriculture,0,0,0,1,1,0,U,0,1,1,0,0,0,0,0 +CLS_50_Urban_builtup,50,Urban_builtup,0,0,0,0,0,0,U,0,0,0,0,0,1,1,0 +CLS_60_bare_sparsevegetation,60,bare_sparsevegetation,0,0,1,0,0,1,U,1,0,0,0,0,0,0,0 +CLS_80_permanent_water,80,permanent_water,0,0,0,0,1,0,U,0,0,0,0,0,0,0,0 +CLS_90_Herbaceous_wetland,90,Herbaceous_wetland,0,0,0,0,1,0,U,0,0,0,0,0,0,0,1 +CLS_111_Closedforest_evergreen_needle,111,Closedforest_evergreen_needle,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_112_Closedforest_evergreen_broad,112,Closedforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_114_Closedforest_deciduous_broad,114,Closedforest_deciduous_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_115_Closedforest_mixed,115,Closedforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_116_Closedforest_unknown,116,Closedforest_unknown,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_121_Openforest_evergreen_needle,121,Openforest_evergreen_needle,1,0,0,0,0,1,U,0,0,0,0,0,0,0,0 +CLS_122_Openforest_evergreen_broad,122,Openforest_evergreen_broad,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_124_Openforest_deciduous_broad,124,Openforest_deciduous_broad,0,1,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_125_Openforest_mixed,125,Openforest_mixed,1,0,0,0,0,0,U,0,0,0,0,0,0,0,0 +CLS_126_Openforest_unknown,126,Openforest_unknown,0,0,0,0,0,0,U,0,0,0,0,0,0,0,0 diff --git 
a/method.md b/method.md index f9d3fce..06daf8d 100644 --- a/method.md +++ b/method.md @@ -120,8 +120,12 @@ python3 ./prepare_layers/make_masks.py --habitat_layers /data/habitat_layers/cur To assist with provenance, we download the data from the Zenodo ID. ```shark-run:reclaimer -curl -o FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip -... +curl -o /data/FABDEM.zip https://data.bris.ac.uk/datasets/tar/s5hqmjcdj8yo2ibzi9b4ew3sn.zip +``` + +```shark-run:gdalonly +python3 tbd.py --input /data/FABDEM.zip \ + --output /data/elevation.tif ``` Similarly to the habitat map we need to resample to 1km, however rather than picking the mean elevation, we select both the min and max elevation for each pixel, and then check whether the species is in that range when we calculate AoH. @@ -214,4 +218,18 @@ python3 ./aoh-calculator/validation/validate_map_prevelence.py --collated_aoh_da ```shark-publish /data/validation/model_validation.csv -``` \ No newline at end of file +``` + +## Threats + +```shark-run:aohbuilder +python3 ./threats/threat_processing.py --speciesdata /data/species-info/* \ + --aoh /data/aohs/ \ + --output /data/threat_rasters + +python3 ./threats/threat_summation.py --threat_rasters /data/threat_rasters --output /data/threat_results +``` + +```shark-publish +/data/threat_results +``` diff --git a/prepare_layers/convert_crosswalk.py b/prepare_layers/convert_crosswalk.py index 37926ff..4049d6e 100644 --- a/prepare_layers/convert_crosswalk.py +++ b/prepare_layers/convert_crosswalk.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path import pandas as pd @@ -28,8 +29,8 @@ } def convert_crosswalk( - original_path: str, - output_path: str, + original_path: Path, + output_path: Path, ) -> None: original = pd.read_csv(original_path) @@ -56,14 +57,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Convert IUCN crosswalk to minimal common format.") parser.add_argument( '--original', - type=str, + type=Path, 
help="Original format", required=True, dest="original_path", ) parser.add_argument( '--output', - type=str, + type=Path, help='Destination minimal file', required=True, dest='output_path', diff --git a/prepare_layers/make_masks.py b/prepare_layers/make_masks.py index f5d9756..2557280 100644 --- a/prepare_layers/make_masks.py +++ b/prepare_layers/make_masks.py @@ -1,62 +1,57 @@ import argparse import os import sys -from glob import glob +from pathlib import Path from typing import Set -import numpy as np -from yirgacheffe.layers import RasterLayer +import yirgacheffe as yg +import yirgacheffe.operators as yo OPEN_SEA_LCC = "lcc_200.tif" NO_DATA_LCC = "lcc_0.tif" def prepare_mask( - layers: Set[str], - output_path: str, + layers: Set[Path], + output_path: Path, at_least: bool = True, ) -> None: assert layers - rasters = [RasterLayer.layer_from_file(x) for x in layers] - - intersection = RasterLayer.find_intersection(rasters) - for r in rasters: - r.set_window_for_intersection(intersection) + rasters = [yg.read_raster(x) for x in layers] calc = rasters[0] for r in rasters[1:]: calc = calc + r if at_least: - calc = calc.numpy_apply(lambda a: np.where(a >= 0.5, 1.0, 0.0)) + calc = yo.where(calc >= 0.5, 1.0, 0.0) else: - calc = calc.numpy_apply(lambda a: np.where(a > 0.5, 1.0, 0.0)) + calc = yo.where(calc > 0.5, 1.0, 0.0) - with RasterLayer.empty_raster_layer_like(rasters[0], filename=output_path) as result: - calc.parallel_save(result) + calc.to_geotiff(output_path, parallelism=128) def prepare_masks( - habitat_layers_path: str, - output_directory_path: str, + habitat_layers_path: Path, + output_directory_path: Path, ) -> None: os.makedirs(output_directory_path, exist_ok=True) - layer_files = set(glob("lcc_*.tif", root_dir=habitat_layers_path)) + layer_files = set(habitat_layers_path.glob("lcc_*.tif")) if not layer_files: sys.exit(f"Found no habitat layers in {habitat_layers_path}") - marine_layers = layer_files & set([OPEN_SEA_LCC]) - terrerstrial_layers = layer_files 
- set([OPEN_SEA_LCC, NO_DATA_LCC]) + marine_layers = {x for x in layer_files if x.name == OPEN_SEA_LCC} + terrerstrial_layers = {x for x in layer_files if x.name not in [OPEN_SEA_LCC, NO_DATA_LCC]} assert len(marine_layers) == 1 - assert len(terrerstrial_layers) == len(layer_files) - 2 + assert len(terrerstrial_layers) < len(layer_files) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in marine_layers}, - os.path.join(output_directory_path, "marine_mask.tif"), + marine_layers, + output_directory_path / "marine_mask.tif", ) prepare_mask( - {os.path.join(habitat_layers_path, x) for x in terrerstrial_layers}, - os.path.join(output_directory_path, "terrestrial_mask.tif"), + terrerstrial_layers, + output_directory_path / "terrestrial_mask.tif", at_least=True, ) @@ -66,14 +61,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generate terrestrial and marine masks.") parser.add_argument( '--habitat_layers', - type=str, + type=Path, help="directory with split and scaled habitat layers", required=True, dest="habitat_layers" ) parser.add_argument( '--output_directory', - type=str, + type=Path, help="Folder for output mask layers", required=True, dest="output_directory" diff --git a/prepare_species/apply_birdlife_data.py b/prepare_species/apply_birdlife_data.py new file mode 100644 index 0000000..da65746 --- /dev/null +++ b/prepare_species/apply_birdlife_data.py @@ -0,0 +1,78 @@ +import argparse +import math +from pathlib import Path + +import aoh +import geopandas as gpd +import pandas as pd + +# Columns from current BirdLife data overrides: +# SIS ID +# Assessment ID +# WBDB ID +# Sequence +# Scientific name +# Common name +# RL Category +# PE +# PEW +# Min altitude (m) +# Max altitude (m) +# Occasional lower elevation +# Occasional upper elevation + +def apply_birdlife_data( + geojson_directory_path: Path, + overrides_path: Path, +) -> None: + overrides = pd.read_csv(overrides_path, encoding="latin1") + + for _, row in overrides.iterrows(): 
+ if math.isnan(row["Occasional lower elevation"]) and math.isnan(row["Occasional upper elevation"]): + continue + + path = geojson_directory_path / "AVES" / "current" / f'{row["SIS ID"]}.geojson' + if not path.exists(): + continue + + species_info = gpd.read_file(path) + data = species_info.loc[0].copy() + + if not math.isnan(row["Occasional lower elevation"]): + data.elevation_lower = float(row["Occasional lower elevation"]) + else: + data.elevation_lower = float(data.elevation_lower) + if not math.isnan(row["Occasional upper elevation"]): + data.elevation_upper = float(row["Occasional upper elevation"]) + else: + data.elevation_upper = float(data.elevation_upper) + data = aoh.tidy_data(data) + + res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry") + res.to_file(path, driver="GeoJSON") + +def main() -> None: + parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") + parser.add_argument( + '--geojsons', + type=Path, + help='Directory where per species Geojson is stored', + required=True, + dest='geojson_directory_path', + ) + parser.add_argument( + '--overrides', + type=Path, + help="CSV of overrides", + required=True, + dest="overrides", + ) + args = parser.parse_args() + + apply_birdlife_data( + args.geojson_directory_path, + args.overrides + ) + +if __name__ == "__main__": + main() diff --git a/prepare_species/extract_species_data_psql.py b/prepare_species/extract_species_data_psql.py index 0ceda78..43b65c7 100644 --- a/prepare_species/extract_species_data_psql.py +++ b/prepare_species/extract_species_data_psql.py @@ -1,14 +1,14 @@ import argparse -import importlib import json import logging import math import os from functools import partial from multiprocessing import Pool -from typing import Any, List, Optional, Set, Tuple +from pathlib import Path +from typing import Any, Optional -# import pyshark # pylint: disable=W0611 +import aoh import geopandas as gpd import pandas 
as pd import pyproj @@ -16,14 +16,13 @@ import shapely from postgis.psycopg import register -aoh_cleaning = importlib.import_module("aoh-calculator.cleaning") logger = logging.getLogger(__name__) logging.basicConfig() logger.setLevel(logging.DEBUG) # To match the FABDEM elevation map we use -# different range min/max/seperation +# different range min/max/separation ELEVATION_MAX = 8580 ELEVATION_MIN = -427 ELEVATION_SPREAD = 12 @@ -31,6 +30,7 @@ COLUMNS = [ "id_no", "assessment_id", + "assessment_year", "season", "systems", "elevation_lower", @@ -61,6 +61,7 @@ SELECT assessments.sis_taxon_id as id_no, assessments.id as assessment_id, + DATE_PART('year', assessments.assessment_date) as assessment_year, assessments.possibly_extinct, assessments.possibly_extinct_in_the_wild, (assessment_supplementary_infos.supplementary_fields->>'ElevationLower.limit')::numeric AS elevation_lower, @@ -180,36 +181,36 @@ def __getattr__(self, name: str) -> Any: return self.info[name] return None - def as_row(self) -> List: + def as_row(self) -> list: return [self.info[k] for k in self.REPORT_COLUMNS] def tidy_reproject_save( gdf: gpd.GeoDataFrame, report: SpeciesReport, - output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], ) -> None: src_crs = pyproj.CRS.from_epsg(4326) target_crs = pyproj.CRS.from_string(target_projection) if target_projection else src_crs graw = gdf.loc[0].copy() - grow = aoh_cleaning.tidy_data( + grow = aoh.tidy_data( graw, elevation_max=ELEVATION_MAX, elevation_min=ELEVATION_MIN, elevation_seperation=ELEVATION_SPREAD, ) os.makedirs(output_directory_path, exist_ok=True) - output_path = os.path.join(output_directory_path, f"{grow.id_no}.geojson") + output_path = output_directory_path / f"{grow.id_no}.geojson" res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry") res_projected = res.to_crs(target_crs) res_projected.to_file(output_path, driver="GeoJSON") report.filename = output_path def 
process_systems( - systems_data: List[Tuple], + systems_data: list[tuple], report: SpeciesReport, -) -> None: +) -> list: if len(systems_data) == 0: raise ValueError("No systems found") if len(systems_data) > 1: @@ -249,9 +250,9 @@ def process_systems( ] def process_threats( - threat_data: List, + threat_data: list[tuple[int, str, str]], report: SpeciesReport, -) -> bool: +) -> list[tuple[int, int]]: cleaned_threats = [] for code, scope, severity in threat_data: if scope is None or scope.lower() == "unknown": @@ -267,9 +268,9 @@ def process_threats( return cleaned_threats def process_habitats( - habitats_data: List[List[str]], + habitats_data: list[list[str]], report: SpeciesReport, -) -> Set: +) -> set: if len(habitats_data) == 0: # Promote to "Unknown" habitats_data = [["18"]] @@ -295,7 +296,7 @@ def process_habitats( return habitats def process_geometries( - geometries_data: List[Tuple[int,shapely.Geometry]], + geometries_data: list[tuple[int, shapely.Geometry]], report: SpeciesReport, ) -> shapely.Geometry: if len(geometries_data) == 0: @@ -326,17 +327,17 @@ def process_geometries( def process_row( class_name: str, - output_directory_path: str, + output_directory_path: Path, target_projection: Optional[str], - presence: Tuple[int], - row: Tuple, -) -> Tuple: + presence: tuple[int, ...], + row: tuple, +) -> SpeciesReport: connection = psycopg2.connect(DB_CONFIG) register(connection) cursor = connection.cursor() - id_no, assessment_id, possibly_extinct, possibly_extinct_in_the_wild, \ + id_no, assessment_id, assessment_year, possibly_extinct, possibly_extinct_in_the_wild, \ elevation_lower, elevation_upper, scientific_name, family_name, category = row report = SpeciesReport(id_no, assessment_id, scientific_name) @@ -378,6 +379,7 @@ def process_row( [[ id_no, assessment_id, + int(assessment_year), "all", systems, int(elevation_lower) if elevation_lower is not None else None, @@ -398,7 +400,7 @@ def process_row( return report def apply_overrides( - overrides_path: 
str, + overrides_path: Path, results, ): overrides = pd.read_csv(overrides_path, encoding="latin1") @@ -425,16 +427,16 @@ def apply_overrides( def extract_data_per_species( class_name: str, - overrides_path: Optional[str], - excludes_path: Optional[str], - output_directory_path: str, + overrides_path: Optional[Path], + excludes_path: Optional[Path], + output_directory_path: Path, target_projection: Optional[str], ) -> None: connection = psycopg2.connect(DB_CONFIG) cursor = connection.cursor() - excludes = tuple([]) + excludes: tuple = tuple([]) if excludes_path is not None: try: df = pd.read_csv(excludes_path) @@ -446,7 +448,7 @@ def extract_data_per_species( # For STAR-R we need historic data, but for STAR-T we just need current. # for era, presence in [("current", (1, 2)), ("historic", (1, 2, 4, 5))]: for era, presence in [("current", (1, 2))]: - era_output_directory_path = os.path.join(output_directory_path, era) + era_output_directory_path = output_directory_path / era # You can't do NOT IN on an empty list in SQL if excludes: @@ -471,17 +473,13 @@ def extract_data_per_species( partial(process_row, class_name, era_output_directory_path, target_projection, presence), results ) - # reports = [ - # process_row(class_name, era_output_directory_path, target_projection, presence, x) - # for x in results[:10] - # ] reports_df = pd.DataFrame( [x.as_row() for x in reports], columns=SpeciesReport.REPORT_COLUMNS ).sort_values('id_no') os.makedirs(era_output_directory_path, exist_ok=True) - reports_df.to_csv(os.path.join(era_output_directory_path, "report.csv"), index=False) + reports_df.to_csv(era_output_directory_path / "report.csv", index=False) def main() -> None: parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.") @@ -494,21 +492,21 @@ def main() -> None: ) parser.add_argument( '--overrides', - type=str, + type=Path, help="CSV of overrides", required=False, dest="overrides", ) parser.add_argument( '--excludes', - 
type=str, + type=Path, help="CSV of taxon IDs to not include", required=False, dest="excludes" ) parser.add_argument( '--output', - type=str, + type=Path, help='Directory where per species GeoJSON is stored', required=True, dest='output_directory_path', diff --git a/requirements.txt b/requirements.txt index 7c9a8ef..720c71a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -alive-progress numpy geopandas postgis @@ -7,15 +6,12 @@ psutil pymer4 pyproj scikit-image -requests -zenodo_search +yirgacheffe>=1.9 +aoh[validation]>=1.0 +# GDAL should be installed manually to match the version of the library installed on your machine gdal[numpy] -git+https://github.com/quantifyearth/iucn_modlib -git+https://github.com/quantifyearth/pyshark -git+https://github.com/quantifyearth/yirgacheffe@4a2cab77f4a64e3f09497ee7098dc9ba499cda90 - pylint mypy pytest diff --git a/scripts/run.sh b/scripts/run.sh index 2ce4863..d88e05b 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -7,7 +7,34 @@ # https://github.com/quantifyearth/reclaimer - used to download inputs from Zenodo directly # https://github.com/quantifyearth/littlejohn - used to run batch jobs in parallel +# Set shell script to exit on first error (-e) and to output commands being run to make +# reviewing logs easier (-x) set -e +set -x + +# We know we use two Go tools, so add go/bin to our path as in slurm world they're likely +# to be installed locally +export PATH="${PATH}":"${HOME}"/go/bin +if ! hash littlejohn 2>/dev/null; then + echo "Please ensure littlejohn is available" + exit 1 +fi +if ! 
hash reclaimer 2>/dev/null; then + echo "Please ensure reclaimer is available" + exit 1 +fi + +# Detect if we're running under SLURM +if [[ -n "${SLURM_JOB_ID}" ]]; then + # Slurm users will probably need to customise this + # shellcheck disable=SC1091 + source "${HOME}"/venvs/star/bin/activate + cd "${HOME}"/dev/star + PROCESS_COUNT="${SLURM_JOB_CPUS_PER_NODE}" +else + PROCESS_COUNT=$(getconf _NPROCESSORS_ONLN) +fi +echo "Using ${PROCESS_COUNT} threads." if [ -z "${DATADIR}" ]; then echo "Please specify $DATADIR" @@ -19,96 +46,100 @@ if [ -z "${VIRTUAL_ENV}" ]; then exit 1 fi -export CPUS=`getconf _NPROCESSORS_ONLN` -export THREADS=$(($CPUS / 2)) -echo "Using $THREADS threads." - declare -a TAXALIST=("AMPHIBIA" "AVES" "MAMMALIA" "REPTILIA") +if [ ! -d "${DATADIR}" ]; then + mkdir "${DATADIR}" +fi + # Get habitat layer and prepare for use -if [ ! -d ${DATADIR}/habitat_layers ]; then - if [ ! -f ${DATADIR}/habitat/raw.tif ]; then +if [ ! -d "${DATADIR}"/habitat_layers ]; then + if [ ! -f "${DATADIR}"/habitat/raw.tif ]; then echo "Fetching habitat map..." - reclaimer zenodo --zenodo_id 3939050 --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif --output ${DATADIR}/habitat/raw.tif + reclaimer zenodo --zenodo_id 3939050 \ + --filename PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif \ + --output "${DATADIR}"/habitat/raw.tif fi echo "Processing habitat map..." - python3 ./aoh-calculator/habitat_process.py --habitat ${DATADIR}/habitat/raw.tif \ - --scale 1000.0 \ - --projection "ESRI:54009" \ - --output ${DATADIR}/tmp_habitat_layers/current - mv ${DATADIR}/tmp_habitat_layers ${DATADIR}/habitat_layers + aoh-habitat-process --habitat "${DATADIR}"/habitat/raw.tif \ + --scale 1000.0 \ + --projection "ESRI:54009" \ + --output "${DATADIR}"/tmp_habitat_layers/current + mv "${DATADIR}"/tmp_habitat_layers "${DATADIR}"/habitat_layers fi -if [ ! -d ${DATADIR}/masks ]; then +if [ ! 
-d "${DATADIR}"/masks ]; then echo "Processing masks..." - python3 ./prepare_layers/make_masks.py --habitat_layers ${DATADIR}/habitat_layers/current \ - --output_directory ${DATADIR}/masks + python3 ./prepare_layers/make_masks.py --habitat_layers "${DATADIR}"/habitat_layers/current \ + --output_directory "${DATADIR}"/masks fi # Fetch and prepare the elevation layers -if [[ ! -f ${DATADIR}/elevation/elevation-max-1k.tif || ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]]; then - if [ ! -f ${DATADIR}/elevation/elevation.tif ]; then +if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then + if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then echo "Fetching elevation map..." - reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output ${DATADIR}/elevation/elevation.tif + reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-max-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then echo "Generating elevation max layer..." - gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-max-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-max-1k.tif fi - if [ ! -f ${DATADIR}/elevation/elevation-min-1k.tif ]; then + if [ ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]; then echo "Generating elevation min layer..." 
- gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 ${DATADIR}/elevation/elevation.tif ${DATADIR}/elevation/elevation-min-1k.tif + gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-min-1k.tif fi fi # Generate the crosswalk table -if [ ! -f ${DATADIR}/crosswalk.csv ]; then +if [ ! -f "${DATADIR}"/crosswalk.csv ]; then echo "Generating crosswalk table..." - python3 ./prepare_layers/convert_crosswalk.py --original ${PWD}/data/crosswalk_bin_T.csv --output ${DATADIR}/crosswalk.csv + python3 ./prepare_layers/convert_crosswalk.py --original ./data/crosswalk_bin_T.csv --output "${DATADIR}"/crosswalk.csv fi # Get species data per taxa from IUCN data for TAXA in "${TAXALIST[@]}" do - echo "Extracting species data for ${TAXA}..." - python3 ./prepare_species/extract_species_data_psql.py --class ${TAXA} --output ${DATADIR}/species-info/${TAXA}/ --projection "ESRI:54009" --excludes ${DATADIR}/SpeciesList_generalisedRangePolygons.csv + if [ ! -d "${DATADIR}"/species-info/"${TAXA}"/ ]; then + echo "Extracting species data for ${TAXA}..." + python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" --output "${DATADIR}"/species-info/"${TAXA}"/ --projection "ESRI:54009" --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv + fi done -if [ -f data/BL_Species_Elevations_2023.csv ]; then +if [ -f "${DATADIR}"/BL_Species_Elevations_2023.csv ]; then echo "Applying birdlife data..." - python3 ./prepare_species/apply_birdlife_data.py --geojsons ${DATADIR}/species-info/AVES --overrides data/BL_Species_Elevations_2023.csv + python3 ./prepare_species/apply_birdlife_data.py --geojsons "${DATADIR}"/species-info/AVES --overrides "${DATADIR}"/BL_Species_Elevations_2023.csv fi echo "Generating AoH task list..." 
-python3 ./utils/aoh_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/aohbatch.csv +python3 ./utils/aoh_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/aohbatch.csv echo "Generating AoHs..." -littlejohn -j ${THREADS} -o ${DATADIR}/aohbatch.log -c ${DATADIR}/aohbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./aoh-calculator/aohcalc.py +littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/aohbatch.log -c "${DATADIR}"/aohbatch.csv "${VIRTUAL_ENV}"/bin/aoh-calc # Calculate predictors from AoHs echo "Generating species richness..." -python3 ./aoh-calculator/summaries/species_richness.py --aohs_folder ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/summaries/species_richness.tif +aoh-species-richness --aohs_folder "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/summaries/species_richness.tif echo "Generating endemism..." -python3 ./aoh-calculator/summaries/endemism.py --aohs_folder ${DATADIR}/aohs/current/ \ - --species_richness ${DATADIR}/summaries/species_richness.tif \ - --output ${DATADIR}/summaries/endemism.tif +aoh-endemism --aohs_folder "${DATADIR}"/aohs/current/ \ + --species_richness "${DATADIR}"/summaries/species_richness.tif \ + --output "${DATADIR}"/summaries/endemism.tif # Aoh Validation echo "Collating validation data..." -python3 ./aoh-calculator/validation/collate_data.py --aoh_results ${DATADIR}/aohs/current/ \ - --output ${DATADIR}/validation/aohs.csv +aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ \ + --output "${DATADIR}"/validation/aohs.csv echo "Calculating model validation..." -python3 ./aoh-calculator/validation/validate_map_prevalence.py --collated_aoh_data ${DATADIR}/validation/aohs.csv \ - --output ${DATADIR}/validation/model_validation.csv +aoh-validate-prevalence --collated_aoh_data "${DATADIR}"/validation/aohs.csv \ + --output "${DATADIR}"/validation/model_validation.csv # Threats echo "Generating threat task list..." 
-python3 ./utils/threats_generator.py --input ${DATADIR}/species-info --datadir ${DATADIR} --output ${DATADIR}/threatbatch.csv +python3 ./utils/threats_generator.py --input "${DATADIR}"/species-info --datadir "${DATADIR}" --output "${DATADIR}"/threatbatch.csv echo "Generating threat rasters..." -littlejohn -j ${THREADS} -o ${DATADIR}/threatbatch.log -c ${DATADIR}/threatbatch.csv ${VIRTUAL_ENV}/bin/python3 -- ./threats/threat_processing.py +littlejohn -j "${PROCESS_COUNT}" -o "${DATADIR}"/threatbatch.log -c "${DATADIR}"/threatbatch.csv "${VIRTUAL_ENV}"/bin/python3 -- ./threats/threat_processing.py echo "Summarising threats..." -python3 ./threats/threat_summation.py --threat_rasters ${DATADIR}/threat_rasters --output ${DATADIR}/threat_results +python3 ./threats/threat_summation.py --threat_rasters "${DATADIR}"/threat_rasters --output "${DATADIR}"/threat_results diff --git a/threats/threat_processing.py b/threats/threat_processing.py index 2a0adca..0e04c73 100644 --- a/threats/threat_processing.py +++ b/threats/threat_processing.py @@ -2,22 +2,23 @@ import json import os import sys +from pathlib import Path import geopandas as gpd +import yirgacheffe as yg from pyogrio.errors import DataSourceError -from yirgacheffe.layers import RasterLayer def threat_processing_per_species( - species_data_path: str, - aoh_path: str, - output_directory_path: str, + species_data_path: Path, + aoh_path: Path, + output_directory_path: Path, ) -> None: try: data = gpd.read_file(species_data_path) except DataSourceError: sys.exit(f"Failed to read {species_data_path}") - with RasterLayer.layer_from_file(aoh_path) as aoh: + with yg.read_raster(aoh_path) as aoh: os.makedirs(output_directory_path, exist_ok=True) @@ -26,8 +27,7 @@ def threat_processing_per_species( threat_data = json.loads(data.threats[0]) try: - aoh_base, _ = os.path.splitext(aoh_path) - aoh_data_path = aoh_base + ".json" + aoh_data_path = aoh_path.with_suffix(".json") with open(aoh_data_path, "r", encoding="UTF-8") as f: 
aoh_data = json.load(f) aoh_total = aoh_data["aoh_total"] @@ -46,11 +46,10 @@ def threat_processing_per_species( per_threat_per_species_score = weighted_species * proportional_threat_weight print(per_threat_per_species_score.sum()) - threat_dir_path = os.path.join(output_directory_path, str(threat_id)) + threat_dir_path = output_directory_path / str(threat_id) os.makedirs(threat_dir_path, exist_ok=True) - output_path = os.path.join(threat_dir_path, f"{taxon_id}.tif") - with RasterLayer.empty_raster_layer_like(aoh, filename=output_path) as result: - per_threat_per_species_score.save(result) + output_path = threat_dir_path / f"{taxon_id}.tif" + per_threat_per_species_score.to_geotiff(output_path) def main() -> None: os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0" @@ -58,21 +57,21 @@ def main() -> None: parser = argparse.ArgumentParser(description="Calculate per species threat layers") parser.add_argument( '--speciesdata', - type=str, + type=Path, help="Single species/seasonality geojson.", required=True, dest="species_data_path" ) parser.add_argument( '--aoh', - type=str, + type=Path, help="AoH raster of speices.", required=True, dest="aoh_path" ) parser.add_argument( '--output', - type=str, + type=Path, help='Directory where per species/threat layers are stored', required=True, dest='output_directory_path', diff --git a/threats/threat_summation.py b/threats/threat_summation.py index 54bf6a5..5b93ede 100644 --- a/threats/threat_summation.py +++ b/threats/threat_summation.py @@ -5,20 +5,17 @@ import time from multiprocessing import Manager, Process, Queue, cpu_count from pathlib import Path -from typing import List -from yirgacheffe.layers import RasterLayer # type: ignore +import yirgacheffe as yg +from yirgacheffe.layers import RasterLayer from osgeo import gdal gdal.SetCacheMax(1024 * 1024 * 32) def worker( - filename: str, - result_dir: str, + output_tif: Path, input_queue: Queue, ) -> None: - output_tif = os.path.join(result_dir, filename) - merged_result = None 
while True: @@ -26,7 +23,7 @@ def worker( if path is None: break - with RasterLayer.layer_from_file(path) as partial_raster: + with yg.read_raster(path) as partial_raster: if merged_result is None: merged_result = RasterLayer.empty_raster_layer_like(partial_raster) cleaned_raster = partial_raster.nan_to_num() @@ -38,24 +35,22 @@ def worker( merged_result = temp if merged_result: - final = RasterLayer.empty_raster_layer_like(merged_result, filename=output_tif) - merged_result.save(final) + merged_result.to_geotiff(output_tif) def raster_sum( - images_list: List[Path], - output_filename: str, + images_list: list[Path], + output_filename: Path, processes_count: int ) -> None: - result_dir, filename = os.path.split(output_filename) - os.makedirs(result_dir, exist_ok=True) + os.makedirs(output_filename.parent, exist_ok=True) - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir_str: + tempdir = Path(tempdir_str) with Manager() as manager: source_queue = manager.Queue() workers = [Process(target=worker, args=( - f"{index}.tif", - tempdir, + tempdir / f"{index}.tif", source_queue )) for index in range(processes_count)] for worker_process in workers: @@ -80,8 +75,7 @@ def raster_sum( # here we should have now a set of images in tempdir to merge single_worker = Process(target=worker, args=( - filename, - result_dir, + output_filename, source_queue )) single_worker.start() @@ -103,17 +97,17 @@ def raster_sum( time.sleep(1) def reduce_to_next_level( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) + files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: code, _ = os.path.splitext(filename.name) next_level_threat_id = 
".".join(code.split('.')[:-1]) @@ -126,22 +120,22 @@ def reduce_to_next_level( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) def reduce_from_species( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: - files = list(Path(rasters_directory).glob("**/*.tif")) + files = list(rasters_directory.glob("**/*.tif")) print(f"total items: {len(files)}") if not files: sys.exit(f"No files in {rasters_directory}, aborting") - buckets = {} + buckets: dict[str,list[Path]] = {} for filename in files: threat_code = filename.parts[-2] levels = threat_code.split('.') @@ -159,31 +153,30 @@ def reduce_from_species( print(f"Found {len(buckets)} threats at current level:") for code, files in buckets.items(): - target_output = os.path.join(output_directory, f"{code}.tif") + target_output = output_directory / f"{code}.tif" print(f"processing {code}: {len(files)} items") raster_sum(files, target_output, processes_count) - def threat_summation( - rasters_directory: str, - output_directory: str, + rasters_directory: Path, + output_directory: Path, processes_count: int, ) -> None: os.makedirs(output_directory, exist_ok=True) # All these files are at level3 to start with, so first make level2 print("processing level 2") - level2_target = os.path.join(output_directory, "level2") + level2_target = output_directory / "level2" reduce_from_species(rasters_directory, level2_target, processes_count) # Now reduce level2 to level1 print("processing level 1") - level1_target = os.path.join(output_directory, "level1") + level1_target = output_directory / "level1" reduce_to_next_level(level2_target, level1_target, processes_count) # Now build a final top level STAR 
print("processing level 0") - final_target = os.path.join(output_directory, "level0") + final_target = output_directory / "level0" reduce_to_next_level(level1_target, final_target, processes_count) @@ -191,14 +184,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Generates the combined, and level 1 and level 2 threat rasters.") parser.add_argument( "--threat_rasters", - type=str, + type=Path, required=True, dest="rasters_directory", help="GeoTIFF file containing level three per species threats" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_directory", help="Destination directory file for results." diff --git a/utils/aoh_generator.py b/utils/aoh_generator.py index 1392f36..2408cf0 100644 --- a/utils/aoh_generator.py +++ b/utils/aoh_generator.py @@ -7,12 +7,11 @@ import pandas as pd def aoh_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -49,21 +48,21 @@ def main() -> None: parser = argparse.ArgumentParser(description="Species and seasonality generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output" diff --git a/utils/collect_validation_data.py b/utils/collect_validation_data.py index 278d71d..49a3563 100644 --- a/utils/collect_validation_data.py +++ b/utils/collect_validation_data.py @@ -1,13 +1,14 @@ import argparse import os import shutil +from pathlib import Path import pandas as pd def collect_validation_data( - 
model_results_path: str, - data_dir: str, - output_dir: str, + model_results_path: Path, + data_dir: Path, + output_dir: Path, ) -> None: model_results = pd.read_csv(model_results_path) os.makedirs(output_dir, exist_ok=True) @@ -29,21 +30,21 @@ def main() -> None: parser = argparse.ArgumentParser(description="Collected range/AOH for species that failed validation") parser.add_argument( '--model_results', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="model_results_path" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output directory", required=True, dest="output" diff --git a/utils/threats_generator.py b/utils/threats_generator.py index 7b2a281..6538e8c 100644 --- a/utils/threats_generator.py +++ b/utils/threats_generator.py @@ -7,12 +7,11 @@ import pandas as pd def threats_generator( - input_dir: str, - data_dir: str, - output_csv_path: str + input_dir: Path, + data_dir: Path, + output_csv_path: Path, ): - taxa_dirs = Path(input_dir).glob("[!.]*") - data_dir = Path(data_dir) + taxa_dirs = input_dir.glob("[!.]*") res = [] for taxa_dir_path in taxa_dirs: @@ -40,24 +39,24 @@ def threats_generator( df.to_csv(output_csv_path, index=False) def main() -> None: - parser = argparse.ArgumentParser(description="threat tasts generator.") + parser = argparse.ArgumentParser(description="threat tasks generator.") parser.add_argument( '--input', - type=str, + type=Path, help="directory with taxa folders of species info", required=True, dest="input_dir" ) parser.add_argument( '--datadir', - type=str, + type=Path, help="directory for results", required=True, dest="data_dir", ) parser.add_argument( '--output', - type=str, + type=Path, help="name of output file for csv", required=True, dest="output"