VMarsocci · yurujaja · Jul 7, 2025 · Jun 20, 2025 · Jul 7, 2025 · Jul 7, 2025
diff --git a/DATASET_GUIDE.md b/DATASET_GUIDE.md
@@ -19,6 +19,7 @@ This document provides a detailed overview of the datasets used in this reposito
 
 ### 🧪 Community-Contributed Datasets
 - [Potsdam](#potsdam)
+- [Open-Canopy](#open-canopy)
 - [Geo-Bench Datasets](#geo-bench-datasets)
   - [Multi-label Classification (e.g., m-BigEarthNet)](#for-multi-label-classification-eg-m-bigearthnet)
   - [Single-label Classification (e.g., m-EuroSat, m-Brick-Kiln)](#for-single-label-classification-ie-m-eurosat-m-brick-kiln-m-forestnet-m-pv4ger-m-so2sat)
@@ -257,6 +258,17 @@ This document provides a detailed overview of the datasets used in this reposito
    criterion=cross_entropy \
    task=segmentation
   ```
+### Open-Canopy
+   ```
+    torchrun --nnodes=1 --nproc_per_node=1 pangaea/run.py \
+    --config-name=train \
+    dataset=opencanopy \
+    encoder=dofa \
+    decoder=reg_upernet \
+    preprocessing=reg_default \
+    criterion=mse \
+    task=regression
+  ```
 ### Geo-Bench Datasets 
 Note that `export GEO_BENCH_DIR=YOUR/PATH/DIR` is required.
 -  For multi-label linear classification, e.g., m-BigEarthNet

diff --git a/README.md b/README.md
@@ -85,6 +85,7 @@ And the following **datasets**:
 
 **Note**: The following datasets are **community-contributed** and are not part of the original benchmark repository. We are grateful for these contributions, which help enrich the benchmark's diversity and applicability.
 - **Potsdam dataset** [[Link](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-potsdam.aspx)]. Contributed by [@pierreadorni](https://github.com/pierreadorni).
+- **Open-Canopy** [[Link](https://arxiv.org/abs/2407.09392)]. Contributed by [@pierreadorni](https://github.com/pierreadorni).
 - **Geo-Bench datasets** [[Link](https://github.com/ServiceNow/geo-bench)]. Contributed by [@yurujaja](https://github.com/yurujaja).
 
 The repository supports the following **tasks** using geospatial (foundation) models:

diff --git a/configs/dataset/opencanopy.yaml b/configs/dataset/opencanopy.yaml
@@ -0,0 +1,35 @@
+_target_: pangaea.datasets.opencanopy.OpenCanopy
+dataset_name: OpenCanopy
+root_path: ./data/canopy_height
+download_url: 
+auto_download: True
+
+img_size: 667
+multi_temporal: False
+multi_modal: False
+
+ignore_index: -1
+num_classes: 1
+classes:
+  - regression
+distribution:
+  - 1.
+
+bands:
+  optical:
+    - B2
+    - B3
+    - B4
+    - B8
+
+data_mean:
+  optical: [124, 124, 124, 124]
+
+data_std:
+  optical: [124, 124, 124, 124]
+
+data_min:
+  optical: [0, 0, 0, 0]
+
+data_max:
+  optical: [255, 255, 255, 255]
diff --git a/pangaea/datasets/opencanopy.py b/pangaea/datasets/opencanopy.py
@@ -0,0 +1,153 @@
+###
+# Open-Canopy Dataset
+# original code https://github.com/fajwel/Open-Canopy
+###
+
+import json
+import os
+
+import numpy as np
+import rasterio
+import torch
+
+from pangaea.datasets.base import RawGeoFMDataset
+
+class OpenCanopy(RawGeoFMDataset):
+    """Open-Canopy tiles: a window of a ``spot/`` raster (image) paired with a ``lidar/`` raster (target)."""
+
+    def __init__(
+        self,
+        split: str,
+        dataset_name: str,
+        multi_modal: bool,
+        multi_temporal: int,
+        root_path: str,
+        classes: list,
+        num_classes: int,
+        ignore_index: int,
+        img_size: int,
+        bands: dict[str, list[str]],
+        distribution: list[float],
+        data_mean: dict[str, list[float]],
+        data_std: dict[str, list[float]],
+        data_min: dict[str, list[float]],
+        data_max: dict[str, list[float]],
+        download_url: str,
+        auto_download: bool,
+    ):
+        """Initialize the Open-Canopy dataset.
+
+        Args:
+            split (str): split of the dataset (train, val, test).
+            dataset_name (str): dataset name.
+            multi_modal (bool): if the dataset is multi-modal.
+            multi_temporal (int): number of temporal frames.
+            root_path (str): folder with geometries.geojson and the <year>/spot, <year>/lidar rasters.
+            classes (list): classes of the dataset.
+            num_classes (int): number of classes.
+            ignore_index (int): index to ignore for metrics and loss.
+            img_size (int): size of the image.
+            bands (dict[str, list[str]]): bands of the dataset.
+            distribution (list[float]): class distribution.
+            data_mean (dict[str, list[float]]): per-band means, keyed by modality.
+            data_std (dict[str, list[float]]): per-band stds, keyed by modality.
+            data_min (dict[str, list[float]]): per-band mins, keyed by modality.
+            data_max (dict[str, list[float]]): per-band maxs, keyed by modality.
+            download_url (str): url to download the dataset.
+            auto_download (bool): whether to download the dataset automatically.
+
+        Raises:
+            AssertionError: if ``split`` is not train, val or test.
+        """
+        super().__init__(
+            split=split,
+            dataset_name=dataset_name,
+            multi_modal=multi_modal,
+            multi_temporal=multi_temporal,
+            root_path=root_path,
+            classes=classes,
+            num_classes=num_classes,
+            ignore_index=ignore_index,
+            img_size=img_size,
+            bands=bands,
+            distribution=distribution,
+            data_mean=data_mean,
+            data_std=data_std,
+            data_min=data_min,
+            data_max=data_max,
+            download_url=download_url,
+            auto_download=auto_download,
+        )
+
+        assert split in ["train", "val", "test"], "Split must be train, val or test"
+        # Keep the configured root so files are not resolved against a hard-coded, CWD-relative path.
+        self._data_root = root_path
+        geojson_path = os.path.join(root_path, "geometries.geojson")
+        with open(geojson_path, "r", encoding="utf-8") as f:
+            self.metadata = json.load(f)
+
+        # delete all geometries that are not in the split
+        self.metadata["features"] = [
+            feature for feature in self.metadata["features"] if feature["properties"]["split"] == split
+        ]
+
+    def __getitem__(self, i: int) -> dict[str, torch.Tensor | dict[str, torch.Tensor]]:
+        """Get the item at index i.
+
+        Args:
+            i (int): index of the item.
+
+        Returns:
+            dict[str, torch.Tensor | dict[str, torch.Tensor]]: output dictionary following the format
+            {"image": {"optical": torch.Tensor},
+             "target": torch.Tensor,
+             "metadata": dict}.
+        """
+        tile = self.metadata["features"][i]
+        filename = tile["properties"]["image_name"]
+        # The last "_"-separated token starts with the 4-character year and also names the lidar file.
+        suffix = filename.split("_")[-1]
+        year_folder = os.path.join(self._data_root, suffix[:4])
+        spot_path = os.path.join(year_folder, "spot", filename)
+        lidar_path = os.path.join(year_folder, "lidar", "compressed_lidar_" + suffix)
+
+        # NOTE(review): assumes ring vertices 0 and 2 are the (left, bottom) / (right, top) corners - confirm.
+        ring = tile["geometry"]["coordinates"][0]
+        with rasterio.open(spot_path) as src:
+            window = rasterio.windows.from_bounds(
+                ring[0][0], ring[0][1], ring[2][0], ring[2][1],
+                transform=src.transform
+            )
+            rgbir = torch.Tensor(src.read(window=window))
+
+        # The window derived from the spot transform is reused unchanged for the lidar raster.
+        with rasterio.open(lidar_path) as src:
+            canopy_height = torch.Tensor(src.read(1, window=window))
+
+        return {
+            "image": {
+                # unsqueeze(1) inserts a singleton axis at position 1 (presumably the temporal axis - confirm).
+                "optical": rgbir.to(torch.float).unsqueeze(1),
+            },
+            "target": canopy_height.to(torch.float),
+            "metadata": {},
+        }
+
+    def __len__(self) -> int:
+        """Return the length of the dataset.
+
+        Returns:
+            int: number of tiles kept for the selected split.
+        """
+        return len(self.metadata["features"])
+
+    @staticmethod
+    def download(self):
+        """Fetch the AI4Forest/Open-Canopy dataset snapshot from the Hugging Face Hub into ``data``."""
+        # NOTE(review): static yet takes ``self`` - presumably called as download(dataset); target dir is fixed.
+        from huggingface_hub import snapshot_download
+        snapshot_download(
+            repo_id="AI4Forest/Open-Canopy",
+            repo_type="dataset",
+            local_dir="data",
+        )