diff --git a/DATASET_GUIDE.md b/DATASET_GUIDE.md index 41101491..d9fc1907 100644 --- a/DATASET_GUIDE.md +++ b/DATASET_GUIDE.md @@ -19,6 +19,7 @@ This document provides a detailed overview of the datasets used in this reposito ### 🧪 Community-Contributed Datasets - [Potsdam](#potsdam) +- [Open-Canopy](#open-canopy) - [Geo-Bench Datasets](#geo-bench-datasets) - [Multi-label Classification (e.g., m-BigEarthNet)](#for-multi-label-classification-eg-m-bigearthnet) - [Single-label Classification (e.g., m-EuroSat, m-Brick-Kiln)](#for-single-label-classification-ie-m-eurosat-m-brick-kiln-m-forestnet-m-pv4ger-m-so2sat) @@ -257,6 +258,17 @@ This document provides a detailed overview of the datasets used in this reposito criterion=cross_entropy \ task=segmentation ``` +### Open-Canopy + ``` + torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \ + --config-name=train \ + dataset=opencanopy \ + encoder=dofa \ + decoder=reg_upernet \ + preprocessing=reg_default \ + criterion=mse \ + task=regression + ``` ### Geo-Bench Datasets Note that `export GEO_BENCH_DIR=YOUR/PATH/DIR` is required. - For multi-label linear classification, e.g., m-BigEarthNet diff --git a/README.md b/README.md index a644d00d..24e493d2 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ And the following **datasets**: **Note**: The following datasets are **community-contributed** and are not part of the original benchmark repository. We are grateful for these contributions, which help enrich the benchmark's diversity and applicability. - **Potsdam dataset** [[Link](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-potsdam.aspx)]. Contributed by [@pierreadorni](https://github.com/pierreadorni). +- **Open-Canopy** [[Link](https://arxiv.org/abs/2407.09392)]. Contributed by [@pierreadorni](https://github.com/pierreadorni). - **Geo-Bench datasets** [[Link](https://github.com/ServiceNow/geo-bench)]. Contributed by [@yurujaja](https://github.com/yurujaja). 
###
# Open-Canopy Dataset
# original code https://github.com/fajwel/Open-Canopy
###

import json
import os

import numpy as np
import rasterio
import torch

from pangaea.datasets.base import RawGeoFMDataset


class OpenCanopy(RawGeoFMDataset):
    """Open-Canopy canopy-height regression dataset.

    Samples are described by the features of ``geometries.geojson`` located in
    ``root_path``. Each feature names a SPOT raster and carries a polygon; the
    polygon's extent is read from ``<root_path>/<year>/spot/<image_name>`` as
    the input image and from
    ``<root_path>/<year>/lidar/compressed_lidar_<tile id>`` as the target.
    """

    # Split names accepted by the constructor.
    _SPLITS = ("train", "val", "test")

    def __init__(
        self,
        split: str,
        dataset_name: str,
        multi_modal: bool,
        multi_temporal: int,
        root_path: str,
        classes: list,
        num_classes: int,
        ignore_index: int,
        img_size: int,
        bands: dict[str, list[str]],
        distribution: list[int],
        data_mean: dict[str, list[float]],
        data_std: dict[str, list[float]],
        data_min: dict[str, list[float]],
        data_max: dict[str, list[float]],
        download_url: str,
        auto_download: bool,
    ):
        """Initialize the Open-Canopy dataset.

        Args:
            split (str): split of the dataset (train, val, test).
            dataset_name (str): dataset name.
            multi_modal (bool): if the dataset is multi-modal.
            multi_temporal (int): number of temporal frames.
            root_path (str): root path of the dataset, i.e. the directory that
                holds ``geometries.geojson`` and the per-year folders.
            classes (list): classes of the dataset.
            num_classes (int): number of classes.
            ignore_index (int): index to ignore for metrics and loss.
            img_size (int): size of the image.
            bands (dict[str, list[str]]): bands of the dataset.
            distribution (list[int]): class distribution.
            data_mean (dict[str, list[float]]): mean for each band for each modality.
                Dictionary with keys as the modality and values as the list of means.
                e.g. {"s2": [b1_mean, ..., bn_mean], "s1": [b1_mean, ..., bn_mean]}
            data_std (dict[str, list[float]]): std for each band for each modality.
                Dictionary with keys as the modality and values as the list of stds.
                e.g. {"s2": [b1_std, ..., bn_std], "s1": [b1_std, ..., bn_std]}
            data_min (dict[str, list[float]]): min for each band for each modality.
                Dictionary with keys as the modality and values as the list of mins.
                e.g. {"s2": [b1_min, ..., bn_min], "s1": [b1_min, ..., bn_min]}
            data_max (dict[str, list[float]]): max for each band for each modality.
                Dictionary with keys as the modality and values as the list of maxs.
                e.g. {"s2": [b1_max, ..., bn_max], "s1": [b1_max, ..., bn_max]}
            download_url (str): url to download the dataset.
            auto_download (bool): whether to download the dataset automatically.

        Raises:
            ValueError: if ``split`` is not one of train, val or test.
        """
        # Explicit raise instead of ``assert`` so the check survives ``python -O``;
        # done first so a bad split fails before the base class does any work.
        if split not in self._SPLITS:
            raise ValueError(f"Split must be train, val or test, got {split!r}")

        super(OpenCanopy, self).__init__(
            split=split,
            dataset_name=dataset_name,
            multi_modal=multi_modal,
            multi_temporal=multi_temporal,
            root_path=root_path,
            classes=classes,
            num_classes=num_classes,
            ignore_index=ignore_index,
            img_size=img_size,
            bands=bands,
            distribution=distribution,
            data_mean=data_mean,
            data_std=data_std,
            data_min=data_min,
            data_max=data_max,
            download_url=download_url,
            auto_download=auto_download,
        )

        # Resolve every file against the configured root rather than a
        # cwd-relative "data/canopy_height". Kept in a private attribute so
        # nothing the base class stores is overwritten.
        self._data_root = root_path

        index_path = os.path.join(self._data_root, "geometries.geojson")
        with open(index_path, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        # Keep only the geometries that belong to the requested split.
        self.metadata["features"] = [
            feature
            for feature in self.metadata["features"]
            if feature["properties"]["split"] == split
        ]

    def __getitem__(self, i: int) -> dict[str, torch.Tensor | dict[str, torch.Tensor]]:
        """Get the item at index i.

        Args:
            i (int): index of the item.

        Returns:
            dict[str, torch.Tensor | dict[str, torch.Tensor]]: output dictionary
            following the format
            {"image":
                {"optical": torch.Tensor},
             "target": torch.Tensor,
             "metadata": dict}.
            Both tensors are float32. "optical" has a singleton dimension
            inserted at position 1 (presumably the temporal axis, giving
            (bands, 1, height, width) - confirm against the encoders).
        """
        tile = self.metadata["features"][i]
        filename = tile["properties"]["image_name"]

        # The last "_"-separated token of the image name starts with the
        # 4-digit acquisition year and doubles as the lidar file suffix.
        tile_id = filename.split("_")[-1]
        year_dir = os.path.join(self._data_root, tile_id[:4])
        spot_path = os.path.join(year_dir, "spot", filename)
        lidar_path = os.path.join(year_dir, "lidar", "compressed_lidar_" + tile_id)

        # Vertices 0 and 2 of the outer ring are used as opposite corners
        # (left/bottom and right/top) of the tile's bounding box.
        ring = tile["geometry"]["coordinates"][0]
        left, bottom = ring[0][0], ring[0][1]
        right, top = ring[2][0], ring[2][1]

        with rasterio.open(spot_path) as src:
            window = rasterio.windows.from_bounds(
                left, bottom, right, top, transform=src.transform
            )
            rgbir = src.read(window=window)

        # NOTE(review): the window computed from the SPOT transform is reused
        # for the lidar raster, which assumes both rasters share the same
        # grid - confirm against the dataset.
        with rasterio.open(lidar_path) as src:
            canopy_height = src.read(1, window=window)

        # Cast in numpy first so any raster dtype converts cleanly to float32.
        optical = torch.from_numpy(rgbir.astype(np.float32))
        target = torch.from_numpy(canopy_height.astype(np.float32))

        return {
            "image": {
                "optical": optical.unsqueeze(1),
            },
            "target": target,
            "metadata": {},
        }

    def __len__(self) -> int:
        """Return the length of the dataset.

        Returns:
            int: number of geometries in the selected split.
        """
        return len(self.metadata["features"])

    @staticmethod
    def download(self):
        """Download the Open-Canopy snapshot from the Hugging Face Hub.

        The snapshot contains a top-level ``canopy_height`` folder, so it is
        extracted into the parent directory of ``self.root_path``; with the
        default ``./data/canopy_height`` this is ``data``, as before. For the
        downloaded files to be found afterwards, the last component of
        ``root_path`` must therefore be ``canopy_height``.

        Args:
            self: object carrying a ``root_path`` attribute (presumably the
                dataset instance - verify against the caller). When the
                attribute is missing, ``data`` is used.
        """
        from huggingface_hub import snapshot_download

        root = getattr(self, "root_path", None)
        if root:
            # normpath drops a trailing slash / leading "./" before taking the parent.
            local_dir = os.path.dirname(os.path.normpath(root)) or "."
        else:
            local_dir = "data"

        snapshot_download(
            repo_id="AI4Forest/Open-Canopy",
            repo_type="dataset",
            local_dir=local_dir,
        )