Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions DATASET_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ This document provides a detailed overview of the datasets used in this reposito

### 🧪 Community-Contributed Datasets
- [Potsdam](#potsdam)
- [Open-Canopy](#open-canopy)
- [Geo-Bench Datasets](#geo-bench-datasets)
- [Multi-label Classification (e.g., m-BigEarthNet)](#for-multi-label-classification-eg-m-bigearthnet)
- [Single-label Classification (e.g., m-EuroSat, m-Brick-Kiln)](#for-single-label-classification-ie-m-eurosat-m-brick-kiln-m-forestnet-m-pv4ger-m-so2sat)
Expand Down Expand Up @@ -257,6 +258,17 @@ This document provides a detailed overview of the datasets used in this reposito
criterion=cross_entropy \
task=segmentation
```
### Open-Canopy
```
torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
--config-name=train \
dataset=opencanopy \
encoder=dofa \
decoder=reg_upernet \
preprocessing=reg_default \
criterion=mse \
task=regression
```
### Geo-Bench Datasets
Note that `export GEO_BENCH_DIR=YOUR/PATH/DIR` is required.
- For multi-label linear classification, e.g., m-BigEarthNet
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ And the following **datasets**:

**Note**: The following datasets are **community-contributed** and are not part of the original benchmark repository. We are grateful for these contributions, which help enrich the benchmark's diversity and applicability.
- **Potsdam dataset** [[Link](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-potsdam.aspx)]. Contributed by [@pierreadorni](https://github.com/pierreadorni).
- **Open-Canopy dataset** [[Link](https://arxiv.org/abs/2407.09392)]. Contributed by [@pierreadorni](https://github.com/pierreadorni).
- **Geo-Bench datasets** [[Link](https://github.com/ServiceNow/geo-bench)]. Contributed by [@yurujaja](https://github.com/yurujaja).

The repository supports the following **tasks** using geospatial (foundation) models:
Expand Down
35 changes: 35 additions & 0 deletions configs/dataset/opencanopy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Dataset configuration for Open-Canopy (single-channel regression target).
# `_target_` is the dotted path of the dataset class this config is meant for.
_target_: pangaea.datasets.opencanopy.OpenCanopy
dataset_name: OpenCanopy
# Directory expected to hold `geometries.geojson` plus the per-year
# `<year>/spot/` and `<year>/lidar/` raster folders.
root_path: ./data/canopy_height
# Left empty on purpose: the dataset class fetches the data from the
# Hugging Face Hub by repository id rather than from a URL.
download_url:
auto_download: True

# NOTE(review): presumably the side length, in pixels, of one tile — confirm.
img_size: 667
multi_temporal: False
multi_modal: False

# Target value to be ignored by the loss and metrics.
ignore_index: -1
# A single output channel, described by one pseudo-class named "regression".
num_classes: 1
classes:
- regression
distribution:
- 1.

# Four optical bands, listed with Sentinel-2 style names.
# NOTE(review): the dataset class reads these from a folder called `spot`;
# confirm the band names/order match what the encoders expect.
bands:
optical:
- B2
- B3
- B4
- B8

# NOTE(review): identical mean and std (124) for every band look like
# placeholders rather than measured statistics — TODO confirm or recompute.
data_mean:
optical: [124, 124, 124, 124]

data_std:
optical: [124, 124, 124, 124]

# Value range consistent with 8-bit imagery.
data_min:
optical: [0, 0, 0, 0]

data_max:
optical: [255, 255, 255, 255]
153 changes: 153 additions & 0 deletions pangaea/datasets/opencanopy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
###
# Open-Canopy Dataset
# Original code: https://github.com/fajwel/Open-Canopy
###

import json
import os

import numpy as np
import rasterio
import torch

from pangaea.datasets.base import RawGeoFMDataset

class OpenCanopy(RawGeoFMDataset):
    """Open-Canopy dataset: optical tiles paired with a canopy-height raster.

    Tiles are described by the features of ``<root_path>/geometries.geojson``.
    Each feature carries the name of the image file it belongs to, the split
    it is assigned to, and a polygon giving the tile footprint. The image is
    read from ``<root_path>/<year>/spot/`` and the regression target from
    ``<root_path>/<year>/lidar/``.
    """

    # Accepted values for the ``split`` argument.
    _SPLITS = ("train", "val", "test")

    def __init__(
        self,
        split: str,
        dataset_name: str,
        multi_modal: bool,
        multi_temporal: int,
        root_path: str,
        classes: list,
        num_classes: int,
        ignore_index: int,
        img_size: int,
        bands: dict[str, list[str]],
        distribution: list[float],
        data_mean: dict[str, list[float]],
        data_std: dict[str, list[float]],
        data_min: dict[str, list[float]],
        data_max: dict[str, list[float]],
        download_url: str,
        auto_download: bool,
    ):
        """Initialize the Open-Canopy dataset.

        Args:
            split (str): split of the dataset (train, val, test).
            dataset_name (str): dataset name.
            multi_modal (bool): if the dataset is multi-modal.
            multi_temporal (int): number of temporal frames.
            root_path (str): root path of the dataset.
            classes (list): classes of the dataset.
            num_classes (int): number of classes.
            ignore_index (int): index to ignore for metrics and loss.
            img_size (int): size of the image.
            bands (dict[str, list[str]]): bands of the dataset.
            distribution (list[float]): class distribution.
            data_mean (dict[str, list[float]]): mean for each band for each modality.
                Dictionary with keys as the modality and values as the list of means.
                e.g. {"s2": [b1_mean, ..., bn_mean], "s1": [b1_mean, ..., bn_mean]}
            data_std (dict[str, list[float]]): std for each band for each modality.
                Dictionary with keys as the modality and values as the list of stds.
                e.g. {"s2": [b1_std, ..., bn_std], "s1": [b1_std, ..., bn_std]}
            data_min (dict[str, list[float]]): min for each band for each modality.
                Dictionary with keys as the modality and values as the list of mins.
                e.g. {"s2": [b1_min, ..., bn_min], "s1": [b1_min, ..., bn_min]}
            data_max (dict[str, list[float]]): max for each band for each modality.
                Dictionary with keys as the modality and values as the list of maxs.
                e.g. {"s2": [b1_max, ..., bn_max], "s1": [b1_max, ..., bn_max]}
            download_url (str): url to download the dataset.
            auto_download (bool): whether to download the dataset automatically.

        Raises:
            ValueError: if ``split`` is not one of train, val or test.
        """
        # Validate before calling the base constructor so that a typo in the
        # split name fails immediately. An explicit raise is used instead of
        # ``assert`` because assertions are stripped under ``python -O``.
        if split not in self._SPLITS:
            raise ValueError("Split must be train, val or test")

        super().__init__(
            split=split,
            dataset_name=dataset_name,
            multi_modal=multi_modal,
            multi_temporal=multi_temporal,
            root_path=root_path,
            classes=classes,
            num_classes=num_classes,
            ignore_index=ignore_index,
            img_size=img_size,
            bands=bands,
            distribution=distribution,
            data_mean=data_mean,
            data_std=data_std,
            data_min=data_min,
            data_max=data_max,
            download_url=download_url,
            auto_download=auto_download,
        )

        # Keep our own copy of the configured root so that every path below is
        # resolved against it instead of a hard-coded "data/canopy_height".
        self._data_root = root_path

        geometries_path = os.path.join(self._data_root, "geometries.geojson")
        with open(geometries_path, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        # Keep only the tiles that belong to the requested split.
        self.metadata["features"] = [
            feature
            for feature in self.metadata["features"]
            if feature["properties"]["split"] == split
        ]

    @staticmethod
    def _read_bounds(path: str, bounds: tuple, indexes=None) -> torch.Tensor:
        """Read the part of raster ``path`` covered by ``bounds`` as float32.

        Args:
            path (str): path of the raster file.
            bounds (tuple): (left, bottom, right, top) in the raster's CRS.
            indexes: band index (or list of indexes) forwarded to
                ``read``; ``None`` reads every band.

        Returns:
            torch.Tensor: float32 tensor holding the pixels inside the window.
        """
        # Imported explicitly so the submodule is guaranteed to be loaded.
        from rasterio.windows import from_bounds

        with rasterio.open(path) as src:
            # The window is derived from this raster's own transform, so the
            # read stays correct even if two rasters do not share a grid.
            window = from_bounds(*bounds, transform=src.transform)
            pixels = src.read(indexes, window=window)
        return torch.from_numpy(pixels.astype(np.float32))

    def __getitem__(self, i: int) -> dict[str, torch.Tensor | dict[str, torch.Tensor]]:
        """Get the item at index i.

        Args:
            i (int): index of the item.

        Returns:
            dict[str, torch.Tensor | dict[str, torch.Tensor]]: output dictionary following the format
            {"image":
                {"optical": torch.Tensor},
            "target": torch.Tensor,
            "metadata": dict}.
        """
        tile = self.metadata["features"][i]
        filename = tile["properties"]["image_name"]

        # The last "_"-separated token of the file name starts with the year
        # and is also the suffix shared with the matching lidar file.
        suffix = filename.split("_")[-1]
        year_dir = os.path.join(self._data_root, suffix[:4])
        spot_path = os.path.join(year_dir, "spot", filename)
        lidar_path = os.path.join(year_dir, "lidar", "compressed_lidar_" + suffix)

        # Bounding box of the exterior ring. Taking min/max over all vertices
        # does not depend on which corner the ring starts at or its winding.
        # Assumes a Polygon geometry (coordinates[0] is the exterior ring).
        ring = tile["geometry"]["coordinates"][0]
        xs = [point[0] for point in ring]
        ys = [point[1] for point in ring]
        bounds = (min(xs), min(ys), max(xs), max(ys))

        rgbir = self._read_bounds(spot_path, bounds)
        canopy_height = self._read_bounds(lidar_path, bounds, indexes=1)

        return {
            "image": {
                # Insert a singleton axis after the band axis.
                # NOTE(review): presumably the temporal axis expected by the
                # encoders — confirm.
                "optical": rgbir.unsqueeze(1),
            },
            "target": canopy_height,
            "metadata": {},
        }

    def __len__(self) -> int:
        """Return the length of the dataset.

        Returns:
            int: number of tiles in the selected split.
        """
        return len(self.metadata["features"])

    @staticmethod
    def download(self) -> None:
        """Download the Open-Canopy dataset from the Hugging Face Hub.

        NOTE(review): this is a static method that still takes ``self``;
        presumably the base class calls it as ``self.download(self)`` —
        confirm before changing the signature.

        Args:
            self: dataset instance; only its ``root_path`` attribute is read,
                and only if it exists.
        """
        from huggingface_hub import snapshot_download

        # The original code downloaded into "data" and then read from
        # "data/canopy_height", i.e. the snapshot is placed in the parent of
        # the dataset root. Mirror that relative to the configured root and
        # fall back to the historical "data" when no root is available.
        root_path = getattr(self, "root_path", None)
        if root_path:
            local_dir = os.path.dirname(os.path.normpath(root_path)) or "."
        else:
            local_dir = "data"

        snapshot_download(
            repo_id="AI4Forest/Open-Canopy",
            repo_type="dataset",
            local_dir=local_dir,
        )
Loading