Skip to content
32 changes: 7 additions & 25 deletions scripts/generation/save_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

# Set the start method for torch multiprocessing. Choose either "forkserver" or "spawn" to be
# compatible with dask's multiprocessing.
mp.set_start_method("forkserver")
mp.set_start_method("spawn")

# Set the sharing strategy to 'file_system' to handle file descriptor limitations. This is
# important because libraries like Zarr may open many files, which can exhaust the file
Expand All @@ -43,9 +43,8 @@

import dask
import hydra
from ocf_data_sampler.torch_datasets.datasets import PVNetUKRegionalDataset, SitesDataset
from ocf_data_sampler.torch_datasets.sample.site import SiteSample
from ocf_data_sampler.torch_datasets.sample.uk_regional import UKRegionalSample
from ocf_data_sampler.torch_datasets.pvnet_dataset import PVNetDataset
import torch
from omegaconf import DictConfig, OmegaConf
from sqlalchemy import exc as sa_exc
from torch.utils.data import DataLoader, Dataset
Expand Down Expand Up @@ -77,33 +76,16 @@ def __init__(self, save_dir: str, renewable: str = "pv_uk"):

def __call__(self, sample, sample_num: int):
"""Save a sample to disk"""
save_path = f"{self.save_dir}/{sample_num:08}"

if self.renewable == "pv_uk":
sample_class = UKRegionalSample(sample)
filename = f"{save_path}.pt"
elif self.renewable == "site":
sample_class = SiteSample(sample)
filename = f"{save_path}.nc"
else:
raise ValueError(f"Unknown renewable: {self.renewable}")
# Assign data and save
sample_class._data = sample
sample_class.save(filename)
save_path = f"{self.save_dir}/{sample_num:08}.pt"
torch.save(sample, save_path)


def get_dataset(
config_path: str, start_time: str, end_time: str, renewable: str = "pv_uk"
) -> Dataset:
"""Get the dataset for the given renewable type."""
if renewable == "pv_uk":
dataset_cls = PVNetUKRegionalDataset
elif renewable == "site":
dataset_cls = SitesDataset
else:
raise ValueError(f"Unknown renewable: {renewable}")

return dataset_cls(config_path, start_time=start_time, end_time=end_time)
# Ignoring renewable parameter as PVNetDataset is generic
return PVNetDataset(config_path, start_time=start_time, end_time=end_time)


def save_samples_with_dataloader(
Expand Down
62 changes: 62 additions & 0 deletions src/open_data_pvnet/INDIA_README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# India Solar Data Pipeline for PVNet

This contribution adds support for **India solar generation data** to the open-data-pvnet project.

## Data Source

**Mendeley Dataset**: [DOI 10.17632/y58jknpgs8.2](https://data.mendeley.com/datasets/y58jknpgs8/2)
- 29 monthly Excel files (Sep 2021 - Jun 2025)
- 5-minute resolution solar/wind generation data
- Covers all 5 Indian regional grids (NR, WR, SR, ER, NER)

## Files Added

### Configuration Files
| File | Description |
|------|-------------|
| `configs/india_pv_data_config.yaml` | India solar data settings |
| `configs/india_gfs_config.yaml` | GFS NWP config for India region |
| `configs/india_regions.csv` | 5 regional grid metadata |
| `configs/PVNet_configs/datamodule/configuration/india_configuration.yaml` | Complete PVNet config |

### Scripts
| File | Description |
|------|-------------|
| `scripts/download_mendeley_india.py` | Dataset download instructions |
| `scripts/process_india_data.py` | Excel → Zarr conversion |
| `scripts/test_india_pipeline.py` | Pipeline validation tests |
| `scripts/train_india_baseline.py` | Solar-only baseline model |

## Data Processing Results

| Metric | Value |
|--------|-------|
| **Rows** | 5,184 hourly |
| **Date Range** | Jan 1, 2024 → Jun 30, 2025 |
| **Mean Solar** | 15,899 MW |
| **Max Solar** | 64,701 MW |

## Baseline Model Results

A simple temporal model (hour, month, lag features) achieves:
- **RMSE**: 8,270 MW
- **MAE/Mean**: ~52%

## Known Limitations

1. **2021-2023 data**: These files use SCADA codes as column headers, so they require a manual column mapping before they can be processed.
2. **NWP coverage**: OCF's GFS data on S3 covers only the UK region, so NWP inputs for India require processing NOAA GFS data separately.

## Next Steps

1. Process NOAA GFS for India (68-98°E, 6-38°N)
2. Add 2021-2023 data with SCADA code mapping
3. Integrate with full PVNet model architecture

## Related Issue

Closes #121 (India contribution)

---

*Contribution by Siddhant Jain ([@Raakshass](https://github.com/Raakshass)) for GSoC 2026*
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# India PVNet Configuration
# Complete configuration for training PVNet on India solar data with GFS NWP

general:
description: India solar generation forecasting configuration
name: india_pvnet_config

input_data:
# India Solar Generation Data (All-India national aggregate)
# Uses 'generation:' (generic PV data) not 'gsp:' (UK-specific Grid Supply Point)
generation:
# Local Zarr path (machine-specific placeholder) - replace with your own path, or with the S3 path once the data is uploaded
zarr_path: "C:/Users/asus vivoBook/Desktop/New folder (2)/pvnet-india-data/processed/india_solar_2024-2025.zarr"
interval_start_minutes: -60 # 1 hour history
interval_end_minutes: 480 # 8 hours forecast
time_resolution_minutes: 60 # Hourly data (60 min)
dropout_timedeltas_minutes: []
dropout_fraction: 0.0
public: false # Local data

# GFS NWP Data for India region
nwp:
gfs:
time_resolution_minutes: 180 # 3-hourly GFS forecasts
interval_start_minutes: -180 # 3 hours before t0
interval_end_minutes: 540 # 9 hours after t0
dropout_fraction: 0.0
dropout_timedeltas_minutes: []

# Global GFS data from OCF S3
zarr_path: "s3://ocf-open-data-pvnet/data/gfs/v4/2024.zarr"
provider: "gfs"
public: true

# Spatial sampling (small patch around site)
# Note: ocf-data-sampler uses generation coordinates (lon/lat) for spatial sampling
image_size_pixels_height: 4
image_size_pixels_width: 4

# Weather channels for solar prediction
channels:
- dlwrf # downwards long-wave radiation flux
- dswrf # downwards short-wave radiation flux (critical for solar)
- hcc # high cloud cover
- lcc # low cloud cover
- mcc # medium cloud cover
- prate # precipitation rate
- r # relative humidity
- t # 2-metre temperature
- tcc # total cloud cover (critical for solar)
- u10 # 10-metre wind U component
- u100 # 100-metre wind U component
- v10 # 10-metre wind V component
- v100 # 100-metre wind V component
- vis # visibility

# GFS normalisation constants (global stats)
normalisation_constants:
dlwrf:
mean: 298.342
std: 96.305916
dswrf:
mean: 168.12321
std: 246.18533
hcc:
mean: 35.272
std: 42.525383
lcc:
mean: 43.578342
std: 44.3732
mcc:
mean: 33.738823
std: 43.150745
prate:
mean: 2.8190969e-05
std: 0.00010159573
r:
mean: 18.359747
std: 25.440672
t:
mean: 278.5223
std: 22.825893
tcc:
mean: 66.841606
std: 41.030598
u10:
mean: -0.0022310058
std: 5.470838
u100:
mean: 0.0823025
std: 6.8899174
v10:
mean: 0.06219831
std: 4.7401133
v100:
mean: 0.0797807
std: 6.076132
vis:
mean: 19628.32
std: 8294.022

# Solar position input
solar_position:
interval_start_minutes: -60
interval_end_minutes: 480
time_resolution_minutes: 60
92 changes: 92 additions & 0 deletions src/open_data_pvnet/configs/india_gfs_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
general:
name: "india_gfs_config"
description: "Configuration for GFS NWP data sampling over India region"

input_data:
nwp:
gfs:
# GFS provides 3-hourly forecasts globally
time_resolution_minutes: 180
interval_start_minutes: -180
interval_end_minutes: 540
dropout_timedeltas_minutes: null
dropout_fraction: 0.0
accum_channels: []
max_staleness_minutes: 540

# Use existing OCF GFS data - filter to India bounds at runtime
zarr_path: "s3://ocf-open-data-pvnet/data/gfs/v4/2023.zarr"
provider: "gfs"
public: true

# India bounding box (approximate)
# North: 38°N (Kashmir), South: 6°N (Kanyakumari)
# West: 68°E (Gujarat), East: 98°E (Arunachal Pradesh)
latitude_bounds: [6.0, 38.0]
longitude_bounds: [68.0, 98.0]

# Spatial sampling
image_size_pixels_height: 2
image_size_pixels_width: 2

# Weather channels for solar prediction
channels:
- dlwrf # downwards long-wave radiation flux
- dswrf # downwards short-wave radiation flux
- hcc # high cloud cover
- lcc # low cloud cover
- mcc # medium cloud cover
- prate # precipitation rate
- r # relative humidity
- t # 2-metre temperature
- tcc # total cloud cover
- u10 # 10-metre wind U component
- u100 # 100-metre wind U component
- v10 # 10-metre wind V component
- v100 # 100-metre wind V component
- vis # visibility

# Normalisation constants (using global GFS stats from UK config)
normalisation_constants:
dlwrf:
mean: 298.342
std: 96.305916
dswrf:
mean: 168.12321
std: 246.18533
hcc:
mean: 35.272
std: 42.525383
lcc:
mean: 43.578342
std: 44.3732
mcc:
mean: 33.738823
std: 43.150745
prate:
mean: 2.8190969e-05
std: 0.00010159573
r:
mean: 18.359747
std: 25.440672
t:
mean: 278.5223
std: 22.825893
tcc:
mean: 66.841606
std: 41.030598
u10:
mean: -0.0022310058
std: 5.470838
u100:
mean: 0.0823025
std: 6.8899174
v10:
mean: 0.06219831
std: 4.7401133
v100:
mean: 0.0797807
std: 6.076132
vis:
mean: 19628.32
std: 8294.022
67 changes: 67 additions & 0 deletions src/open_data_pvnet/configs/india_pv_data_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
general:
name: "india_pv_config"
description: "India solar generation data configuration from Grid-India Mendeley dataset"

input_data:
# India reuses the "gsp" input structure, with regional grid IDs in place of UK GSP IDs
# region_id mapping:
# 0: Northern Region (NR) - Delhi/NCR
# 1: Western Region (WR) - Mumbai/Gujarat
# 2: Southern Region (SR) - Chennai/Karnataka
# 3: Eastern Region (ER) - Kolkata/Bihar
# 4: North-Eastern Region (NER) - Guwahati/Assam

gsp:
# Path to processed India solar Zarr (to be uploaded after processing)
zarr_path: "data/india/india_solar_2021-2023.zarr"

# Processed India solar data is hourly (60 minutes); the raw Mendeley files are at 5-minute resolution
time_resolution_minutes: 60

# History and forecast windows
interval_start_minutes: -60 # 1 hour of history
interval_end_minutes: 480 # 8 hours of forecast

# No dropout for initial training
dropout_timedeltas_minutes: []
dropout_fraction: 0.0

public: true

# India regional grid metadata
# Coordinates are approximate centroids of each regional grid
regions:
- region_id: 0
name: "Northern Region (NR)"
abbreviation: "NR"
latitude: 28.6139
longitude: 77.2090
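states: ["Delhi", "Haryana", "Punjab", "Rajasthan", "UP", "UK", "HP", "J&K"]  # UP = Uttar Pradesh, UK = Uttarakhand (not United Kingdom), HP = Himachal Pradesh, J&K = Jammu & Kashmir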

- region_id: 1
name: "Western Region (WR)"
abbreviation: "WR"
latitude: 19.0760
longitude: 72.8777
states: ["Maharashtra", "Gujarat", "MP", "Chhattisgarh", "Goa", "Daman & Diu"]

- region_id: 2
name: "Southern Region (SR)"
abbreviation: "SR"
latitude: 13.0827
longitude: 80.2707
states: ["Tamil Nadu", "Karnataka", "Kerala", "Andhra Pradesh", "Telangana", "Puducherry"]

- region_id: 3
name: "Eastern Region (ER)"
abbreviation: "ER"
latitude: 22.5726
longitude: 88.3639
states: ["West Bengal", "Bihar", "Jharkhand", "Odisha", "Sikkim"]

- region_id: 4
name: "North-Eastern Region (NER)"
abbreviation: "NER"
latitude: 26.1445
longitude: 91.7362
states: ["Assam", "Arunachal Pradesh", "Manipur", "Meghalaya", "Mizoram", "Nagaland", "Tripura"]
6 changes: 6 additions & 0 deletions src/open_data_pvnet/configs/india_regions.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
region_id,region_name,abbreviation,latitude,longitude,capacity_mw,states
0,Northern Region,NR,28.6139,77.2090,,Delhi|Haryana|Punjab|Rajasthan|Uttar Pradesh|Uttarakhand|Himachal Pradesh|Jammu & Kashmir
1,Western Region,WR,19.0760,72.8777,,Maharashtra|Gujarat|Madhya Pradesh|Chhattisgarh|Goa|Daman & Diu
2,Southern Region,SR,13.0827,80.2707,,Tamil Nadu|Karnataka|Kerala|Andhra Pradesh|Telangana|Puducherry
3,Eastern Region,ER,22.5726,88.3639,,West Bengal|Bihar|Jharkhand|Odisha|Sikkim
4,North-Eastern Region,NER,26.1445,91.7362,,Assam|Arunachal Pradesh|Manipur|Meghalaya|Mizoram|Nagaland|Tripura
Loading