diff --git a/scripts/generation/save_samples.py b/scripts/generation/save_samples.py index 509ee72..1742b28 100644 --- a/scripts/generation/save_samples.py +++ b/scripts/generation/save_samples.py @@ -27,7 +27,7 @@ # Set the start method for torch multiprocessing. Choose either "forkserver" or "spawn" to be # compatible with dask's multiprocessing. - mp.set_start_method("forkserver") + mp.set_start_method("spawn") # Set the sharing strategy to 'file_system' to handle file descriptor limitations. This is # important because libraries like Zarr may open many files, which can exhaust the file @@ -43,9 +43,8 @@ import dask import hydra -from ocf_data_sampler.torch_datasets.datasets import PVNetUKRegionalDataset, SitesDataset -from ocf_data_sampler.torch_datasets.sample.site import SiteSample -from ocf_data_sampler.torch_datasets.sample.uk_regional import UKRegionalSample +from ocf_data_sampler.torch_datasets.pvnet_dataset import PVNetDataset +import torch from omegaconf import DictConfig, OmegaConf from sqlalchemy import exc as sa_exc from torch.utils.data import DataLoader, Dataset @@ -77,33 +76,16 @@ def __init__(self, save_dir: str, renewable: str = "pv_uk"): def __call__(self, sample, sample_num: int): """Save a sample to disk""" - save_path = f"{self.save_dir}/{sample_num:08}" - - if self.renewable == "pv_uk": - sample_class = UKRegionalSample(sample) - filename = f"{save_path}.pt" - elif self.renewable == "site": - sample_class = SiteSample(sample) - filename = f"{save_path}.nc" - else: - raise ValueError(f"Unknown renewable: {self.renewable}") - # Assign data and save - sample_class._data = sample - sample_class.save(filename) + save_path = f"{self.save_dir}/{sample_num:08}.pt" + torch.save(sample, save_path) def get_dataset( config_path: str, start_time: str, end_time: str, renewable: str = "pv_uk" ) -> Dataset: """Get the dataset for the given renewable type.""" - if renewable == "pv_uk": - dataset_cls = PVNetUKRegionalDataset - elif renewable == "site": - 
dataset_cls = SitesDataset - else: - raise ValueError(f"Unknown renewable: {renewable}") - - return dataset_cls(config_path, start_time=start_time, end_time=end_time) + # Ignoring renewable parameter as PVNetDataset is generic + return PVNetDataset(config_path, start_time=start_time, end_time=end_time) def save_samples_with_dataloader( diff --git a/src/open_data_pvnet/INDIA_README.md b/src/open_data_pvnet/INDIA_README.md new file mode 100644 index 0000000..20635eb --- /dev/null +++ b/src/open_data_pvnet/INDIA_README.md @@ -0,0 +1,62 @@ +# India Solar Data Pipeline for PVNet + +This contribution adds support for **India solar generation data** to the open-data-pvnet project. + +## Data Source + +**Mendeley Dataset**: [DOI 10.17632/y58jknpgs8.2](https://data.mendeley.com/datasets/y58jknpgs8/2) +- 29 monthly Excel files (Sep 2021 - Jun 2025) +- 5-minute resolution solar/wind generation data +- Covers all 5 Indian regional grids (NR, WR, SR, ER, NER) + +## Files Added + +### Configuration Files +| File | Description | +|------|-------------| +| `configs/india_pv_data_config.yaml` | India solar data settings | +| `configs/india_gfs_config.yaml` | GFS NWP config for India region | +| `configs/india_regions.csv` | 5 regional grid metadata | +| `configs/PVNet_configs/datamodule/configuration/india_configuration.yaml` | Complete PVNet config | + +### Scripts +| File | Description | +|------|-------------| +| `scripts/download_mendeley_india.py` | Dataset download instructions | +| `scripts/process_india_data.py` | Excel → Zarr conversion | +| `scripts/test_india_pipeline.py` | Pipeline validation tests | +| `scripts/train_india_baseline.py` | Solar-only baseline model | + +## Data Processing Results + +| Metric | Value | +|--------|-------| +| **Rows** | 5,184 hourly | +| **Date Range** | Jan 1, 2024 → Jun 30, 2025 | +| **Mean Solar** | 15,899 MW | +| **Max Solar** | 64,701 MW | + +## Baseline Model Results + +A simple temporal model (hour, month, lag features) achieves: +- 
**RMSE**: 8,270 MW +- **MAE/Mean**: ~52% + +## Known Limitations + +1. **2021-2023 data**: Uses SCADA codes as column headers - requires manual mapping +2. **NWP coverage**: OCF's GFS S3 data only covers UK region. India NWP needs NOAA GFS processing. + +## Next Steps + +1. Process NOAA GFS for India (68-98°E, 6-38°N) +2. Add 2021-2023 data with SCADA code mapping +3. Integrate with full PVNet model architecture + +## Related Issue + +Closes #121 (India contribution) + +--- + +*Contribution by Siddhant Jain ([@Raakshass](https://github.com/Raakshass)) for GSoC 2026* diff --git a/src/open_data_pvnet/configs/PVNet_configs/datamodule/configuration/india_configuration.yaml b/src/open_data_pvnet/configs/PVNet_configs/datamodule/configuration/india_configuration.yaml new file mode 100644 index 0000000..6b74ec3 --- /dev/null +++ b/src/open_data_pvnet/configs/PVNet_configs/datamodule/configuration/india_configuration.yaml @@ -0,0 +1,106 @@ +# India PVNet Configuration +# Complete configuration for training PVNet on India solar data with GFS NWP + +general: + description: India solar generation forecasting configuration + name: india_pvnet_config + +input_data: + # India Solar Generation Data (All-India national aggregate) + # Uses 'generation:' (generic PV data) not 'gsp:' (UK-specific Grid Supply Point) + generation: + # Local Zarr path - update to S3 path when uploaded + zarr_path: "C:/Users/asus vivoBook/Desktop/New folder (2)/pvnet-india-data/processed/india_solar_2024-2025.zarr" + interval_start_minutes: -60 # 1 hour history + interval_end_minutes: 480 # 8 hours forecast + time_resolution_minutes: 60 # Hourly data (60 min) + dropout_timedeltas_minutes: [] + dropout_fraction: 0.0 + public: false # Local data + + # GFS NWP Data for India region + nwp: + gfs: + time_resolution_minutes: 180 # 3-hourly GFS forecasts + interval_start_minutes: -180 # 3 hours before t0 + interval_end_minutes: 540 # 9 hours after t0 + dropout_fraction: 0.0 + dropout_timedeltas_minutes: [] + + 
# Global GFS data from OCF S3 + zarr_path: "s3://ocf-open-data-pvnet/data/gfs/v4/2024.zarr" + provider: "gfs" + public: true + + # Spatial sampling (small patch around site) + # Note: ocf-data-sampler uses generation coordinates (lon/lat) for spatial sampling + image_size_pixels_height: 4 + image_size_pixels_width: 4 + + # Weather channels for solar prediction + channels: + - dlwrf # downwards long-wave radiation flux + - dswrf # downwards short-wave radiation flux (critical for solar) + - hcc # high cloud cover + - lcc # low cloud cover + - mcc # medium cloud cover + - prate # precipitation rate + - r # relative humidity + - t # 2-metre temperature + - tcc # total cloud cover (critical for solar) + - u10 # 10-metre wind U component + - u100 # 100-metre wind U component + - v10 # 10-metre wind V component + - v100 # 100-metre wind V component + - vis # visibility + + # GFS normalisation constants (global stats) + normalisation_constants: + dlwrf: + mean: 298.342 + std: 96.305916 + dswrf: + mean: 168.12321 + std: 246.18533 + hcc: + mean: 35.272 + std: 42.525383 + lcc: + mean: 43.578342 + std: 44.3732 + mcc: + mean: 33.738823 + std: 43.150745 + prate: + mean: 2.8190969e-05 + std: 0.00010159573 + r: + mean: 18.359747 + std: 25.440672 + t: + mean: 278.5223 + std: 22.825893 + tcc: + mean: 66.841606 + std: 41.030598 + u10: + mean: -0.0022310058 + std: 5.470838 + u100: + mean: 0.0823025 + std: 6.8899174 + v10: + mean: 0.06219831 + std: 4.7401133 + v100: + mean: 0.0797807 + std: 6.076132 + vis: + mean: 19628.32 + std: 8294.022 + + # Solar position input + solar_position: + interval_start_minutes: -60 + interval_end_minutes: 480 + time_resolution_minutes: 60 diff --git a/src/open_data_pvnet/configs/india_gfs_config.yaml b/src/open_data_pvnet/configs/india_gfs_config.yaml new file mode 100644 index 0000000..d0fc9f6 --- /dev/null +++ b/src/open_data_pvnet/configs/india_gfs_config.yaml @@ -0,0 +1,92 @@ +general: + name: "india_gfs_config" + description: "Configuration for GFS 
NWP data sampling over India region" + +input_data: + nwp: + gfs: + # GFS provides 3-hourly forecasts globally + time_resolution_minutes: 180 + interval_start_minutes: -180 + interval_end_minutes: 540 + dropout_timedeltas_minutes: null + dropout_fraction: 0.0 + accum_channels: [] + max_staleness_minutes: 540 + + # Use existing OCF GFS data - filter to India bounds at runtime + zarr_path: "s3://ocf-open-data-pvnet/data/gfs/v4/2023.zarr" + provider: "gfs" + public: true + + # India bounding box (approximate) + # North: 38°N (Kashmir), South: 6°N (Kanyakumari) + # West: 68°E (Gujarat), East: 98°E (Arunachal Pradesh) + latitude_bounds: [6.0, 38.0] + longitude_bounds: [68.0, 98.0] + + # Spatial sampling + image_size_pixels_height: 2 + image_size_pixels_width: 2 + + # Weather channels for solar prediction + channels: + - dlwrf # downwards long-wave radiation flux + - dswrf # downwards short-wave radiation flux + - hcc # high cloud cover + - lcc # low cloud cover + - mcc # medium cloud cover + - prate # precipitation rate + - r # relative humidity + - t # 2-metre temperature + - tcc # total cloud cover + - u10 # 10-metre wind U component + - u100 # 100-metre wind U component + - v10 # 10-metre wind V component + - v100 # 100-metre wind V component + - vis # visibility + + # Normalisation constants (using global GFS stats from UK config) + normalisation_constants: + dlwrf: + mean: 298.342 + std: 96.305916 + dswrf: + mean: 168.12321 + std: 246.18533 + hcc: + mean: 35.272 + std: 42.525383 + lcc: + mean: 43.578342 + std: 44.3732 + mcc: + mean: 33.738823 + std: 43.150745 + prate: + mean: 2.8190969e-05 + std: 0.00010159573 + r: + mean: 18.359747 + std: 25.440672 + t: + mean: 278.5223 + std: 22.825893 + tcc: + mean: 66.841606 + std: 41.030598 + u10: + mean: -0.0022310058 + std: 5.470838 + u100: + mean: 0.0823025 + std: 6.8899174 + v10: + mean: 0.06219831 + std: 4.7401133 + v100: + mean: 0.0797807 + std: 6.076132 + vis: + mean: 19628.32 + std: 8294.022 diff --git 
a/src/open_data_pvnet/configs/india_pv_data_config.yaml b/src/open_data_pvnet/configs/india_pv_data_config.yaml new file mode 100644 index 0000000..c9a61a4 --- /dev/null +++ b/src/open_data_pvnet/configs/india_pv_data_config.yaml @@ -0,0 +1,67 @@ +general: + name: "india_pv_config" + description: "India solar generation data configuration from Grid-India Mendeley dataset" + +input_data: + # India uses "gsp" structure but with regional IDs instead of UK GSP IDs + # region_id mapping: + # 0: Northern Region (NR) - Delhi/NCR + # 1: Western Region (WR) - Mumbai/Gujarat + # 2: Southern Region (SR) - Chennai/Karnataka + # 3: Eastern Region (ER) - Kolkata/Bihar + # 4: North-Eastern Region (NER) - Guwahati/Assam + + gsp: + # Path to processed India solar Zarr (to be uploaded after processing) + zarr_path: "data/india/india_solar_2021-2023.zarr" + + # Mendeley data is hourly (60 minutes) + time_resolution_minutes: 60 + + # History and forecast windows + interval_start_minutes: -60 # 1 hour of history + interval_end_minutes: 480 # 8 hours of forecast + + # No dropout for initial training + dropout_timedeltas_minutes: [] + dropout_fraction: 0.0 + + public: true + +# India regional grid metadata +# Coordinates are approximate centroids of each regional grid +regions: + - region_id: 0 + name: "Northern Region (NR)" + abbreviation: "NR" + latitude: 28.6139 + longitude: 77.2090 + states: ["Delhi", "Haryana", "Punjab", "Rajasthan", "UP", "UK", "HP", "J&K"] + + - region_id: 1 + name: "Western Region (WR)" + abbreviation: "WR" + latitude: 19.0760 + longitude: 72.8777 + states: ["Maharashtra", "Gujarat", "MP", "Chhattisgarh", "Goa", "Daman & Diu"] + + - region_id: 2 + name: "Southern Region (SR)" + abbreviation: "SR" + latitude: 13.0827 + longitude: 80.2707 + states: ["Tamil Nadu", "Karnataka", "Kerala", "Andhra Pradesh", "Telangana", "Puducherry"] + + - region_id: 3 + name: "Eastern Region (ER)" + abbreviation: "ER" + latitude: 22.5726 + longitude: 88.3639 + states: ["West Bengal", 
"Bihar", "Jharkhand", "Odisha", "Sikkim"] + + - region_id: 4 + name: "North-Eastern Region (NER)" + abbreviation: "NER" + latitude: 26.1445 + longitude: 91.7362 + states: ["Assam", "Arunachal Pradesh", "Manipur", "Meghalaya", "Mizoram", "Nagaland", "Tripura"] diff --git a/src/open_data_pvnet/configs/india_regions.csv b/src/open_data_pvnet/configs/india_regions.csv new file mode 100644 index 0000000..048d5cb --- /dev/null +++ b/src/open_data_pvnet/configs/india_regions.csv @@ -0,0 +1,6 @@ +region_id,region_name,abbreviation,latitude,longitude,capacity_mw,states +0,Northern Region,NR,28.6139,77.2090,,Delhi|Haryana|Punjab|Rajasthan|Uttar Pradesh|Uttarakhand|Himachal Pradesh|Jammu & Kashmir +1,Western Region,WR,19.0760,72.8777,,Maharashtra|Gujarat|Madhya Pradesh|Chhattisgarh|Goa|Daman & Diu +2,Southern Region,SR,13.0827,80.2707,,Tamil Nadu|Karnataka|Kerala|Andhra Pradesh|Telangana|Puducherry +3,Eastern Region,ER,22.5726,88.3639,,West Bengal|Bihar|Jharkhand|Odisha|Sikkim +4,North-Eastern Region,NER,26.1445,91.7362,,Assam|Arunachal Pradesh|Manipur|Meghalaya|Mizoram|Nagaland|Tripura diff --git a/src/open_data_pvnet/nwp/gfs.py b/src/open_data_pvnet/nwp/gfs.py index 364db23..3c7e809 100644 --- a/src/open_data_pvnet/nwp/gfs.py +++ b/src/open_data_pvnet/nwp/gfs.py @@ -1,8 +1,70 @@ +""" +GFS NWP data processing for open-data-pvnet. + +Downloads NOAA GFS forecast data using Herbie (byte-range downloads) +and converts to OCF-compatible Zarr format for PVNet training. + +Supports region-specific processing (India, UK, etc.) with configurable +bounding boxes and channel selection. 
+""" + import logging +import os +from pathlib import Path logger = logging.getLogger(__name__) -def process_gfs_data(year, month): - logger.info(f"Downloading GFS data for {year}-{month}") - raise NotImplementedError("The process_gfs_data function is not implemented yet.") +def process_gfs_data( + year: int, + month: int, + region: str = "india", + output_dir: str | None = None, + max_days: int | None = None, +) -> str: + """ + Download and process GFS NWP data for a specific region and time period. + + Uses Herbie for efficient byte-range downloads from NOAA S3, + extracting only the 14 OCF channels needed for PVNet. + + Args: + year: Year to process + month: Month to process (1-12) + region: Target region ("india" or "uk") + output_dir: Output directory for Zarr files. Defaults to data/gfs_{region}/ + max_days: Limit number of days (for testing) + + Returns: + Path to the output Zarr file. + + Raises: + ValueError: If region is not supported. + RuntimeError: If no data could be processed. + """ + if region not in ("india", "uk"): + raise ValueError(f"Unsupported region: {region}. Use 'india' or 'uk'.") + + if output_dir is None: + output_dir = f"data/gfs_{region}" + + # Import here to avoid requiring herbie as a top-level dependency + from open_data_pvnet.scripts.download_gfs_india import process_month + + logger.info(f"Processing GFS data for {region}: {year}-{month:02d}") + + zarr_path = process_month( + year=year, + month=month, + output_dir=output_dir, + max_days=max_days, + ) + + if zarr_path is None: + raise RuntimeError( + f"No GFS data processed for {region} {year}-{month:02d}. " + "Check network connectivity and NOAA S3 availability." 
+ ) + + logger.info(f"GFS data saved to {zarr_path}") + return zarr_path diff --git a/src/open_data_pvnet/scripts/download_gfs_india.py b/src/open_data_pvnet/scripts/download_gfs_india.py new file mode 100644 index 0000000..0a6e974 --- /dev/null +++ b/src/open_data_pvnet/scripts/download_gfs_india.py @@ -0,0 +1,611 @@ +""" +Download and process NOAA GFS data for India region. + +Two download modes: + 1. NOMADS GRIB filter (fast) — Selects specific variables + India subregion + in a single HTTP request. Returns ~100-200KB per file vs 300MB full GRIB. + Only available for last ~10 days of data. + 2. Herbie byte-range (fallback) — For historical data from S3. + Uses .idx index files to download specific variables. + +Output: OCF-compatible Zarr with dims (init_time_utc, step, latitude, longitude) +and 14 data variables matching existing GFS schema. + +Usage: + # Fast mode — recent data via NOMADS filter (recommended for testing) + python download_gfs_india.py --year 2026 --months 2 --max-days 1 + + # Historical data via Herbie S3 byte-range + python download_gfs_india.py --year 2024 --months 1 --max-days 1 --source herbie + + # Parallel downloads (10 workers) + python download_gfs_india.py --year 2024 --months 1 --max-days 3 --workers 10 + +Requirements: + pip install xarray cfgrib eccodes numpy pandas zarr requests + pip install herbie-data # only needed for --source herbie +""" + +import argparse +import logging +import os +import tempfile +import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timedelta +from pathlib import Path + +import numpy as np +import pandas as pd +import requests +import xarray as xr + +warnings.filterwarnings("ignore", category=FutureWarning) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + +# 
--------------------------------------------------------------------------- # +# OCF channel mapping +# --------------------------------------------------------------------------- # + +# NOMADS uses different parameter names than GRIB shortnames +# Format: ocf_name -> (nomads_var_param, herbie_search_regex, description) +OCF_CHANNELS = { + "dlwrf": { + "nomads": "DLWRF", + "search": ":DLWRF:surface", + "level": "surface", + }, + "dswrf": { + "nomads": "DSWRF", + "search": ":DSWRF:surface", + "level": "surface", + }, + "hcc": { + "nomads": "HCDC", + "search": ":HCDC:high cloud layer:(?!.*ave)", + "level": "high_cloud_layer", + }, + "lcc": { + "nomads": "LCDC", + "search": ":LCDC:low cloud layer:(?!.*ave)", + "level": "low_cloud_layer", + }, + "mcc": { + "nomads": "MCDC", + "search": ":MCDC:middle cloud layer:(?!.*ave)", + "level": "middle_cloud_layer", + }, + "prate": { + "nomads": "PRATE", + "search": ":PRATE:surface:(?!.*ave)", + "level": "surface", + }, + "r": { + "nomads": "RH", + "search": ":RH:850 mb", + "level": "850_mb", + }, + "t": { + "nomads": "TMP", + "search": ":TMP:2 m above ground", + "level": "2_m_above_ground", + }, + "tcc": { + "nomads": "TCDC", + "search": ":TCDC:entire atmosphere:(?!.*ave)", + "level": "entire_atmosphere_(considered_as_a_single_layer)", + }, + "u10": { + "nomads": "UGRD", + "search": ":UGRD:10 m above ground", + "level": "10_m_above_ground", + }, + "u100": { + "nomads": "UGRD", + "search": ":UGRD:100 m above ground", + "level": "100_m_above_ground", + }, + "v10": { + "nomads": "VGRD", + "search": ":VGRD:10 m above ground", + "level": "10_m_above_ground", + }, + "v100": { + "nomads": "VGRD", + "search": ":VGRD:100 m above ground", + "level": "100_m_above_ground", + }, + "vis": { + "nomads": "VIS", + "search": ":VIS:surface", + "level": "surface", + }, +} + +# India bounding box (with 1° buffer) +INDIA_LAT_MIN = 5.0 +INDIA_LAT_MAX = 39.0 +INDIA_LON_MIN = 67.0 +INDIA_LON_MAX = 99.0 + +# GFS forecast hours (17 steps: 0-48h at 3h 
intervals) +FORECAST_HOURS = list(range(0, 49, 3)) + +# GFS initialization hours (4x daily) +INIT_HOURS = [0, 6, 12, 18] + +# NOMADS base URL +NOMADS_BASE = "https://nomads.ncep.noaa.gov/cgi-bin/filter_gfs_0p25.pl" + + +# --------------------------------------------------------------------------- # +# NOMADS GRIB Filter — fast, subregion-aware downloads +# --------------------------------------------------------------------------- # + +def build_nomads_url(date: datetime, init_hour: int, fxx: int) -> str: + """ + Build NOMADS GRIB filter URL for India-subset GFS download. + + Downloads ALL 14 OCF variables + India subregion in a single request. + Returns ~100-200KB GRIB file instead of 300MB full global file. + """ + date_str = date.strftime("%Y%m%d") + + params = { + "dir": f"/gfs.{date_str}/{init_hour:02d}/atmos", + "file": f"gfs.t{init_hour:02d}z.pgrb2.0p25.f{fxx:03d}", + # Subregion — India with buffer + "subregion": "", + "toplat": str(INDIA_LAT_MAX), + "bottomlat": str(INDIA_LAT_MIN), + "leftlon": str(INDIA_LON_MIN), + "rightlon": str(INDIA_LON_MAX), + } + + # Add all variable selections + nomads_vars = set() + for spec in OCF_CHANNELS.values(): + nomads_vars.add(spec["nomads"]) + for var in sorted(nomads_vars): + params[f"var_{var}"] = "on" + + # Add level selections + nomads_levels = set() + for spec in OCF_CHANNELS.values(): + nomads_levels.add(spec["level"]) + for level in sorted(nomads_levels): + params[f"lev_{level}"] = "on" + + # Build URL manually (NOMADS is finicky about param order) + param_str = "&".join(f"{k}={v}" for k, v in params.items()) + return f"{NOMADS_BASE}?{param_str}" + + +def download_nomads_step( + date: datetime, + init_hour: int, + fxx: int, + tmp_dir: str, + timeout: int = 60, +) -> str | None: + """Download a single forecast step via NOMADS grib filter.""" + url = build_nomads_url(date, init_hour, fxx) + fname = f"gfs_{date.strftime('%Y%m%d')}_{init_hour:02d}z_f{fxx:03d}.grib2" + local_path = os.path.join(tmp_dir, fname) + + if 
os.path.exists(local_path) and os.path.getsize(local_path) > 1000: + return local_path + + try: + resp = requests.get(url, timeout=timeout) + if resp.status_code == 200 and len(resp.content) > 1000: + with open(local_path, "wb") as f: + f.write(resp.content) + size_kb = len(resp.content) / 1024 + logger.debug(f" Downloaded f{fxx:03d}: {size_kb:.0f} KB") + return local_path + else: + logger.debug(f" f{fxx:03d}: HTTP {resp.status_code} or empty") + return None + except Exception as e: + logger.debug(f" f{fxx:03d}: download failed ({e})") + return None + + +def extract_variables_from_grib(grib_path: str) -> dict[str, xr.DataArray]: + """Extract OCF variables from a subsetted GRIB file.""" + variables = {} + + for ocf_name, spec in OCF_CHANNELS.items(): + try: + ds = xr.open_dataset( + grib_path, + engine="cfgrib", + backend_kwargs={ + "filter_by_keys": { + "shortName": spec["nomads"].lower() + if spec["nomads"] not in ("HCDC", "LCDC", "MCDC") + else spec["nomads"].lower(), + }, + "errors": "ignore", + }, + ) + + if len(ds.data_vars) == 0: + continue + + var_name = list(ds.data_vars)[0] + da = ds[var_name].load() + + # Drop extra coords + keep = {"latitude", "longitude"} + drop = [c for c in da.coords if c not in keep] + da = da.drop_vars(drop, errors="ignore") + da.name = ocf_name + + variables[ocf_name] = da.astype(np.float32) + ds.close() + + except Exception: + pass + + return variables + + +# --------------------------------------------------------------------------- # +# Herbie byte-range downloads — for historical S3 data +# --------------------------------------------------------------------------- # + +def download_herbie_step( + date_str: str, + init_hour: int, + fxx: int, + channels: list[str], +) -> dict[str, xr.DataArray]: + """Download variables via Herbie byte-range from S3.""" + from herbie import Herbie + + variables = {} + try: + H = Herbie( + date_str, + model="gfs", + fxx=fxx, + product="pgrb2.0p25", + verbose=False, + ) + + for ch_name in channels: 
+ spec = OCF_CHANNELS[ch_name] + try: + ds = H.xarray(spec["search"], verbose=False) + if ds is None or len(ds.data_vars) == 0: + continue + + var_name = list(ds.data_vars)[0] + da = ds[var_name].load() + + # Subset to India + if float(da.longitude.max()) > 180: + da = da.sel( + latitude=slice(INDIA_LAT_MAX, INDIA_LAT_MIN), + longitude=slice(INDIA_LON_MIN, INDIA_LON_MAX), + ) + else: + da = da.sel( + latitude=slice(INDIA_LAT_MAX, INDIA_LAT_MIN), + longitude=slice(INDIA_LON_MIN, INDIA_LON_MAX), + ) + + keep = {"latitude", "longitude"} + drop = [c for c in da.coords if c not in keep] + da = da.drop_vars(drop, errors="ignore") + da.name = ch_name + variables[ch_name] = da.astype(np.float32) + + except Exception: + pass + + except Exception as e: + logger.warning(f" Herbie init failed for f{fxx:03d}: {e}") + + return variables + + +# --------------------------------------------------------------------------- # +# Core processing pipeline +# --------------------------------------------------------------------------- # + +def process_single_init_time( + date: datetime, + init_hour: int, + source: str = "nomads", + workers: int = 6, + channels: list[str] | None = None, +) -> xr.Dataset | None: + """ + Process all forecast steps for a single GFS init time. 
+ + Args: + date: Date to process + init_hour: Init hour (0, 6, 12, 18) + source: "nomads" (fast, recent data) or "herbie" (historical S3) + workers: Number of parallel download workers + channels: Channel subset (default: all 14) + + Returns: + xr.Dataset with dims (init_time_utc, step, latitude, longitude) + """ + if channels is None: + channels = list(OCF_CHANNELS.keys()) + + init_time = pd.Timestamp(date.strftime("%Y-%m-%d")) + pd.Timedelta( + hours=init_hour + ) + logger.info(f"Processing {init_time} [{source}] " + f"({len(channels)}ch × {len(FORECAST_HOURS)}steps, " + f"{workers} workers)") + + step_datasets = [] + + if source == "nomads": + # NOMADS: download subsetted GRIB files in parallel + with tempfile.TemporaryDirectory(prefix="gfs_india_") as tmp_dir: + # Parallel download + grib_paths = {} + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + download_nomads_step, date, init_hour, fxx, tmp_dir + ): fxx + for fxx in FORECAST_HOURS + } + for future in as_completed(futures): + fxx = futures[future] + try: + path = future.result() + if path: + grib_paths[fxx] = path + except Exception as e: + logger.debug(f" f{fxx:03d}: {e}") + + logger.info(f" Downloaded {len(grib_paths)}/{len(FORECAST_HOURS)} steps") + + # Extract variables from each GRIB (parallel) + for fxx in FORECAST_HOURS: + if fxx not in grib_paths: + continue + variables = extract_variables_from_grib(grib_paths[fxx]) + if not variables: + continue + + step_ds = xr.Dataset(variables) + step_td = np.timedelta64(fxx, "h") + step_ds = step_ds.expand_dims({"step": [step_td]}) + step_datasets.append(step_ds) + + else: + # Herbie: byte-range downloads from S3 + date_str = date.strftime("%Y-%m-%d") + for fxx in FORECAST_HOURS: + variables = download_herbie_step(date_str, init_hour, fxx, channels) + if not variables: + logger.debug(f" f{fxx:03d}: no variables") + continue + + step_ds = xr.Dataset(variables) + step_td = np.timedelta64(fxx, "h") + step_ds = 
step_ds.expand_dims({"step": [step_td]}) + step_datasets.append(step_ds) + + n_ok = len(variables) + logger.info(f" f{fxx:03d}: {n_ok}/{len(channels)} channels OK") + + if not step_datasets: + logger.warning(f" No valid steps for {init_time}") + return None + + combined = xr.concat(step_datasets, dim="step") + combined = combined.expand_dims({"init_time_utc": [init_time]}) + + logger.info(f" ✓ {init_time}: {len(step_datasets)} steps, " + f"{len(combined.data_vars)} channels") + return combined + + +def process_month( + year: int, + month: int, + output_dir: str, + max_days: int | None = None, + source: str = "nomads", + workers: int = 6, + channels: list[str] | None = None, + dry_run: bool = False, +) -> str | None: + """ + Process one month of GFS data for India and save as Zarr. + + Args: + year: Year to process + month: Month to process (1-12) + output_dir: Directory for output Zarr files + max_days: Limit days per month (testing) + source: "nomads" or "herbie" + workers: Parallel download workers + channels: Channel subset + dry_run: Verify availability without downloading + + Returns: + Path to output Zarr, or None. 
+ """ + start = datetime(year, month, 1) + end = datetime(year + (month // 12), (month % 12) + 1, 1) + dates = [] + current = start + while current < end: + dates.append(current) + current += timedelta(days=1) + + if max_days: + dates = dates[:max_days] + + n_init = len(dates) * len(INIT_HOURS) + logger.info(f"{'[DRY RUN] ' if dry_run else ''}" + f"{year}-{month:02d}: {len(dates)} days, {n_init} init times " + f"[{source}, {workers} workers]") + + if dry_run: + if source == "nomads": + url = build_nomads_url(dates[0], 0, 3) + try: + resp = requests.head(url, timeout=10) + logger.info(f" NOMADS: HTTP {resp.status_code}") + except Exception as e: + logger.warning(f" NOMADS: {e}") + return None + + all_datasets = [] + + for date in dates: + for init_hour in INIT_HOURS: + try: + ds = process_single_init_time( + date, init_hour, source, workers, channels + ) + if ds is not None: + all_datasets.append(ds) + except Exception as e: + logger.error(f"Failed {date.strftime('%Y-%m-%d')} " + f"{init_hour:02d}Z: {e}") + + if not all_datasets: + logger.warning(f"No data processed for {year}-{month:02d}") + return None + + logger.info(f"Combining {len(all_datasets)} init times...") + combined = xr.concat(all_datasets, dim="init_time_utc") + combined = combined.sortby("init_time_utc") + + # Ensure latitude descending (N→S, matching OCF convention) + if combined.latitude[0] < combined.latitude[-1]: + combined = combined.isel(latitude=slice(None, None, -1)) + + # Save as Zarr + output_path = os.path.join(output_dir, f"india_gfs_{year}_{month:02d}.zarr") + os.makedirs(output_dir, exist_ok=True) + + logger.info(f"Saving {output_path}...") + logger.info(f" Dims: {dict(combined.dims)}") + logger.info(f" Channels: {list(combined.data_vars)}") + lat_min = float(combined.latitude.min()) + lat_max = float(combined.latitude.max()) + lon_min = float(combined.longitude.min()) + lon_max = float(combined.longitude.max()) + logger.info(f" Lat: {lat_min:.1f} to {lat_max:.1f}") + logger.info(f" Lon: 
{lon_min:.1f} to {lon_max:.1f}") + + combined.to_zarr(output_path, mode="w", consolidated=True) + logger.info(f"✓ Saved: {output_path}") + + return output_path + + +def merge_monthly_zarrs(zarr_paths: list[str], output_path: str) -> str: + """Merge monthly Zarr files into a single yearly Zarr.""" + logger.info(f"Merging {len(zarr_paths)} files → {output_path}") + datasets = [xr.open_zarr(p) for p in zarr_paths] + combined = xr.concat(datasets, dim="init_time_utc") + combined = combined.sortby("init_time_utc") + combined.to_zarr(output_path, mode="w", consolidated=True) + logger.info(f"✓ Merged: {combined.dims['init_time_utc']} init times") + return output_path + + +def validate_zarr(zarr_path: str) -> bool: + """Validate Zarr matches OCF GFS schema.""" + logger.info(f"Validating {zarr_path}...") + ds = xr.open_zarr(zarr_path) + + required_dims = {"init_time_utc", "step", "latitude", "longitude"} + actual_dims = set(ds.dims) + assert required_dims.issubset(actual_dims), \ + f"Missing dims: {required_dims - actual_dims}" + + expected = set(OCF_CHANNELS.keys()) + actual = set(ds.data_vars) + missing = expected - actual + if missing: + logger.warning(f" Missing channels: {missing}") + else: + logger.info(" ✓ All 14 channels present") + + assert float(ds.latitude.min()) <= 8.0 + assert float(ds.latitude.max()) >= 36.0 + assert float(ds.longitude.min()) <= 70.0 + assert float(ds.longitude.max()) >= 96.0 + logger.info(" ✓ Spatial coverage OK (India)") + + for var in ds.data_vars: + assert ds[var].dtype == np.float32 + logger.info(" ✓ float32 types OK") + + logger.info(" ✓ Validation passed") + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Download NOAA GFS data for India → OCF Zarr" + ) + parser.add_argument("--year", type=int, required=True) + parser.add_argument("--months", type=int, nargs="+", required=True) + parser.add_argument("--output-dir", type=str, default="data/gfs_india") + parser.add_argument("--max-days", type=int, 
default=None) + parser.add_argument("--source", choices=["nomads", "herbie"], default="nomads", + help="nomads=GRIB filter (fast, recent), " + "herbie=S3 byte-range (historical)") + parser.add_argument("--workers", type=int, default=6, + help="Parallel download workers") + parser.add_argument("--channels", type=str, nargs="+", default=None) + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--merge", action="store_true") + parser.add_argument("--validate", type=str, default=None) + + args = parser.parse_args() + + if args.validate: + validate_zarr(args.validate) + return + + monthly_paths = [] + for month in args.months: + path = process_month( + year=args.year, + month=month, + output_dir=args.output_dir, + max_days=args.max_days, + source=args.source, + workers=args.workers, + channels=args.channels, + dry_run=args.dry_run, + ) + if path: + monthly_paths.append(path) + + if args.merge and len(monthly_paths) > 1: + yearly = os.path.join(args.output_dir, f"india_gfs_{args.year}.zarr") + merge_monthly_zarrs(monthly_paths, yearly) + validate_zarr(yearly) + elif monthly_paths: + validate_zarr(monthly_paths[-1]) + + +if __name__ == "__main__": + main() diff --git a/src/open_data_pvnet/scripts/download_mendeley_india.py b/src/open_data_pvnet/scripts/download_mendeley_india.py new file mode 100644 index 0000000..70a0b87 --- /dev/null +++ b/src/open_data_pvnet/scripts/download_mendeley_india.py @@ -0,0 +1,255 @@ +""" +India PVNet Data Download Script + +This script provides utilities for downloading and processing India solar generation data +from various sources for PVNet training. + +Data Sources: +1. Mendeley Dataset (DOI: 10.17632/y58jknpgs8.2) + - Hourly data from Grid-India NERLDC + - Sep 2021 - Dec 2023 + - 5 regional grids (NR, WR, SR, ER, NER) + +2. 
Kaggle Solar Power Generation (backup) + - 15-min data from 2 Indian plants + - 34 days of data + - https://www.kaggle.com/datasets/anikannal/solar-power-generation-data + +Usage: + # After manual download from Mendeley: + python download_mendeley_india.py --process --input-dir raw_data/ + + # For Kaggle data (requires kaggle API key): + python download_mendeley_india.py --kaggle +""" + +import os +import sys +import pandas as pd +import xarray as xr +from pathlib import Path +from datetime import datetime +import logging +import argparse + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Configuration +DATA_DIR = Path("c:/Users/asus vivoBook/Desktop/New folder (2)/pvnet-india-data") +RAW_DIR = DATA_DIR / "raw" +PROCESSED_DIR = DATA_DIR / "processed" + +# Mendeley dataset info +MENDELEY_DOI = "10.17632/y58jknpgs8.2" +MENDELEY_URL = f"https://data.mendeley.com/datasets/y58jknpgs8/2" + +# India regional grid metadata +INDIA_REGIONS = { + "NR": {"name": "Northern Region", "lat": 28.6139, "lon": 77.2090}, + "WR": {"name": "Western Region", "lat": 19.0760, "lon": 72.8777}, + "SR": {"name": "Southern Region", "lat": 13.0827, "lon": 80.2707}, + "ER": {"name": "Eastern Region", "lat": 22.5726, "lon": 88.3639}, + "NER": {"name": "North-Eastern Region", "lat": 26.1445, "lon": 91.7362}, +} + + +def print_download_instructions(): + """Print manual download instructions for Mendeley dataset.""" + instructions = f""" +╔══════════════════════════════════════════════════════════════════════════════╗ +║ MENDELEY INDIA DATASET DOWNLOAD GUIDE ║ +╠══════════════════════════════════════════════════════════════════════════════╣ +║ ║ +║ The Mendeley API has known issues. Please download manually: ║ +║ ║ +║ 1. Open browser and go to: ║ +║ {MENDELEY_URL:<55}║ +║ ║ +║ 2. Click "Download all files" button (usually a ZIP archive) ║ +║ ║ +║ 3. Save to: ║ +║ {str(RAW_DIR):<55}║ +║ ║ +║ 4. 
def find_mendeley_files(input_dir: Path) -> list:
    """Find Mendeley data files in the input directory.

    Searches ``input_dir`` recursively for supported tabular formats.

    Args:
        input_dir: Directory holding the downloaded raw data.

    Returns:
        Sorted list of unique ``Path`` objects for matching files.
    """
    extensions = ['.xlsx', '.xls', '.csv', '.parquet']
    # A single recursive glob per extension is sufficient: the '**' pattern
    # also matches files directly inside input_dir. The previous version ran
    # both a flat and a recursive glob, so every top-level file appeared twice.
    files = set()
    for ext in extensions:
        files.update(input_dir.glob(f'**/*{ext}'))
    return sorted(files)
def convert_to_zarr(df: pd.DataFrame, output_path: Path):
    """Convert processed DataFrame to Zarr format for PVNet.

    Args:
        df: Processed generation data; must contain 'region_id' and
            'datetime_gmt' columns to use as the index.
        output_path: Destination Zarr store (overwritten if it exists).
    """
    logger.info(f"Converting to Zarr: {output_path}")

    # Convert to xarray Dataset
    # NOTE(review): assumes 'region_id' and 'datetime_gmt' exist — confirm
    # the upstream processing produced them before calling.
    ds = xr.Dataset.from_dataframe(df.set_index(['region_id', 'datetime_gmt']))

    # Chunk appropriately: one chunk per region, 1000 timestamps per chunk.
    ds = ds.chunk({'region_id': 1, 'datetime_gmt': 1000})

    # Save to Zarr (mode='w' overwrites any existing store at this path)
    ds.to_zarr(str(output_path), mode='w', consolidated=True)
    logger.info(f"Saved Zarr dataset to {output_path}")
def main():
    """CLI entry point: download and/or process India solar data.

    Modes:
      --kaggle    download the Kaggle backup dataset, then exit
      --process   load Mendeley files from --input-dir, validate, save CSV
      (no flags)  print manual download instructions
    """
    parser = argparse.ArgumentParser(description="India PVNet Data Download & Processing")
    parser.add_argument('--process', action='store_true', help='Process downloaded Mendeley data')
    parser.add_argument('--kaggle', action='store_true', help='Download Kaggle backup dataset')
    parser.add_argument('--input-dir', type=str, default=str(RAW_DIR), help='Input directory with raw data')
    # NOTE(review): --validate is parsed but never acted on below — dead flag.
    parser.add_argument('--validate', action='store_true', help='Validate existing data')

    args = parser.parse_args()

    # Create directories up front so downstream writes cannot fail on paths.
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

    if args.kaggle:
        download_kaggle_dataset()
        return

    if args.process:
        input_dir = Path(args.input_dir)
        files = find_mendeley_files(input_dir)

        if not files:
            logger.error(f"No data files found in {input_dir}")
            print_download_instructions()
            return

        logger.info(f"Found {len(files)} data file(s)")

        all_data = []
        for f in files:
            try:
                df = load_mendeley_data(f)
                all_data.append(df)
            except Exception as e:
                # Best-effort: skip unreadable files but keep processing the rest.
                logger.error(f"Failed to load {f}: {e}")

        if all_data:
            combined_df = pd.concat(all_data, ignore_index=True)
            validate_data(combined_df)

            # Save intermediate CSV for manual inspection / later stages.
            combined_df.to_csv(PROCESSED_DIR / "india_solar_combined.csv", index=False)
            logger.info(f"Saved combined data to {PROCESSED_DIR / 'india_solar_combined.csv'}")
    else:
        # Default: print download instructions
        print_download_instructions()
def load_2024_2025_file(file_path: Path) -> pd.DataFrame:
    """Load the Jan 2024 - Jun 2025 file with readable columns.

    Args:
        file_path: Path to the "January 2024- June 2025.xlsx" workbook.

    Returns:
        DataFrame with standardized columns ('datetime',
        'solar_generation_mw', 'wind_generation_mw', 'demand_mw' where
        present); rows with unparseable timestamps are dropped.
    """
    logger.info(f"Loading 2024-2025 data: {file_path.name}")

    # Use a context manager so the workbook handle is always released
    # (the previous version leaked the open ExcelFile).
    with pd.ExcelFile(file_path) as xl:
        df = xl.parse('Report', header=0)

    # Map the human-readable headers onto our canonical column names.
    col_map = {}
    for col in df.columns:
        col_lower = str(col).lower()
        if 'timestamp' in col_lower or 'time' in col_lower:
            col_map[col] = 'datetime'
        elif 'solar' in col_lower:
            col_map[col] = 'solar_generation_mw'
        elif 'wind' in col_lower:
            col_map[col] = 'wind_generation_mw'
        elif 'demand' in col_lower:
            col_map[col] = 'demand_mw'

    df = df.rename(columns=col_map)

    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
        df = df.dropna(subset=['datetime'])

    # Convert to numeric; bad cells become NaN instead of raising.
    for col in ['solar_generation_mw', 'wind_generation_mw', 'demand_mw']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    logger.info(f"  Loaded {len(df)} rows from 2024-2025 data")

    # Show columns found
    found = [c for c in ['solar_generation_mw', 'wind_generation_mw', 'demand_mw'] if c in df.columns]
    logger.info(f"  Columns: {found}")

    return df
def main():
    """Process India solar Excel data into an ocf-data-sampler Zarr.

    Loads the Jan 2024 - Jun 2025 workbook (readable headers), renames
    columns to the ocf-data-sampler generation schema, attaches an
    All-India location with approximate coordinates, estimates capacity,
    and writes both a reference CSV and a Zarr store.
    """
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

    # Start with the 2024-2025 file which has clear columns
    file_2024_2025 = RAW_DIR / "January 2024- June 2025.xlsx"

    all_data = []

    if file_2024_2025.exists():
        df = load_2024_2025_file(file_2024_2025)
        if not df.empty and 'solar_generation_mw' in df.columns:
            all_data.append(df)

    # For now, skip the SCADA-coded files until we find the column mapping
    # TODO: Add SCADA column index mapping once identified

    logger.info(f"\nSuccessfully loaded {len(all_data)} files with solar data")

    if not all_data:
        logger.error("No valid data found!")
        return

    # Combine, then deduplicate timestamps (later files win nothing: first kept)
    combined = pd.concat(all_data, ignore_index=True)
    combined = combined.sort_values('datetime').drop_duplicates(subset=['datetime'])

    logger.info(f"\nCombined dataset: {len(combined)} rows")
    logger.info(f"Date range: {combined['datetime'].min()} to {combined['datetime'].max()}")

    # Rename columns to ocf-data-sampler expected format
    # See: https://github.com/openclimatefix/ocf-data-sampler/blob/main/ocf_data_sampler/load/generation.py
    hourly = combined.copy()
    hourly = hourly.rename(columns={
        'datetime': 'time_utc',
        'solar_generation_mw': 'generation_mw'
    })

    # Use location_id instead of region_id (ocf-data-sampler expects location_id)
    hourly['location_id'] = 0  # 0 = All India aggregate

    # Add required coordinates for ocf-data-sampler
    # India center approx: 20°N, 78°E (used for GFS NWP extraction)
    hourly['longitude'] = 78.0  # India center longitude
    hourly['latitude'] = 20.0  # India center latitude

    # Estimate capacity_mwp: the 95th percentile of observed generation is a
    # robust proxy (less outlier-sensitive than the raw maximum).
    solar_capacity_mwp = hourly['generation_mw'].quantile(0.95)
    hourly['capacity_mwp'] = solar_capacity_mwp

    logger.info(f"Hourly dataset: {len(hourly)} rows")
    logger.info(f"  Estimated capacity: {solar_capacity_mwp:.0f} MWp")

    # Save CSV with original column names for reference
    csv_data = hourly.copy()
    csv_path = PROCESSED_DIR / "india_solar_hourly.csv"
    csv_data.to_csv(csv_path, index=False)
    logger.info(f"Saved CSV: {csv_path}")

    # Create xarray Dataset with ocf-data-sampler schema
    # Dimensions: (time_utc, location_id)
    # Data Variables: generation_mw, capacity_mwp
    # Coordinates: time_utc, location_id, longitude, latitude

    time_utc = pd.to_datetime(hourly['time_utc']).values
    location_ids = hourly['location_id'].unique()

    # Single-location reshape: (1, n_times). This assumes exactly one
    # location_id (the All-India aggregate set above).
    generation_mw = xr.DataArray(
        data=hourly['generation_mw'].values.reshape(1, -1),  # (location, time)
        dims=['location_id', 'time_utc'],
        coords={
            'location_id': ('location_id', location_ids),
            'time_utc': ('time_utc', time_utc),
            'longitude': ('location_id', [78.0]),
            'latitude': ('location_id', [20.0]),
        }
    )

    capacity_mwp = xr.DataArray(
        data=np.full((1, len(time_utc)), solar_capacity_mwp),  # Same capacity for all times
        dims=['location_id', 'time_utc'],
        coords={
            'location_id': ('location_id', location_ids),
            'time_utc': ('time_utc', time_utc),
            'longitude': ('location_id', [78.0]),
            'latitude': ('location_id', [20.0]),
        }
    )

    ds = xr.Dataset({
        'generation_mw': generation_mw,
        'capacity_mwp': capacity_mwp,
    })

    ds.attrs = {
        'description': 'India All-India solar generation for PVNet training',
        'source': 'Mendeley DOI: 10.17632/y58jknpgs8.2 (Grid-India/POSOCO)',
        'schema': 'ocf-data-sampler generation format',
        'time_resolution': '1 hour',
        'location': 'All India aggregate',
        'date_range': f"{hourly['time_utc'].min()} to {hourly['time_utc'].max()}",
        'created': datetime.now().isoformat()
    }

    zarr_path = PROCESSED_DIR / "india_solar_2024-2025.zarr"
    ds.to_zarr(str(zarr_path), mode='w', consolidated=True)
    logger.info(f"Saved Zarr: {zarr_path}")

    # Verify the schema
    logger.info("\n=== Zarr Schema Verification ===")
    # .sizes instead of dict(ds.dims): the mapping form of .dims is deprecated.
    logger.info(f"Dimensions: {dict(ds.sizes)}")
    logger.info("Coordinates:")
    for coord in ds.coords:
        logger.info(f"  {coord}: {ds.coords[coord].dtype}")
    logger.info("Data Variables:")
    for var in ds.data_vars:
        logger.info(f"  {var}: {ds.data_vars[var].dtype}, dims {ds.data_vars[var].dims}")

    # Print summary stats
    # FIX: the frame was renamed above, so the solar column is now
    # 'generation_mw' — the old loop looked for 'solar_generation_mw' and
    # silently skipped the solar summary entirely.
    logger.info("\n" + "="*60)
    logger.info("DATA SUMMARY")
    logger.info("="*60)
    for col in ['generation_mw', 'wind_generation_mw', 'demand_mw']:
        if col in hourly.columns:
            data = hourly[col].dropna()
            logger.info(f"\n{col}:")
            logger.info(f"  Count: {len(data)}")
            logger.info(f"  Min: {data.min():.2f} MW")
            logger.info(f"  Max: {data.max():.2f} MW")
            logger.info(f"  Mean: {data.mean():.2f} MW")
            logger.info(f"  Missing: {hourly[col].isna().sum()} ({hourly[col].isna().mean()*100:.1f}%)")

    logger.info("\n✅ Processing complete!")
    logger.info("\nNOTE: Only Jan 2024-Jun 2025 data processed. SCADA-coded files (2021-2023) need column mapping.")
def test_india_solar_values_reasonable():
    """Check that mock generation values stay in a physically sensible range."""
    dataset = _create_mock_india_solar_dataset()
    generation = dataset["generation_mw"].values

    # Non-negative everywhere, strictly positive somewhere, bounded above.
    assert np.all(generation >= 0), "Solar generation should be non-negative"
    assert np.any(generation > 0), "Should have some positive generation"
    assert np.all(generation < 100_000), "Generation should be below 100 GW"
def load_india_data() -> pd.DataFrame:
    """Load India solar data from Zarr.

    The Zarr store written by ``process_india_data.py`` uses the
    ocf-data-sampler schema ('time_utc' coordinate, 'generation_mw'
    variable), while the feature code in this module expects the legacy
    names ('datetime_gmt', 'solar_generation_mw') — so rename on load.

    Returns:
        DataFrame with 'datetime_gmt' and 'solar_generation_mw' columns;
        rows with NaN generation are dropped.
    """
    zarr_path = PROCESSED_DIR / "india_solar_2024-2025.zarr"
    ds = xr.open_zarr(str(zarr_path))

    df = ds.to_dataframe().reset_index()

    # FIX: the store holds 'generation_mw'/'time_utc', not the legacy
    # names — dropping NaNs on 'solar_generation_mw' raised KeyError before.
    df = df.rename(columns={
        'time_utc': 'datetime_gmt',
        'generation_mw': 'solar_generation_mw',
    })
    df = df.dropna(subset=['solar_generation_mw'])

    logger.info(f"Loaded {len(df)} rows of India solar data")
    return df
def create_lag_features(df: pd.DataFrame, target_col: str, lags: list,
                        windows: tuple = (3, 6, 12, 24)) -> pd.DataFrame:
    """Create lagged and rolling-mean features for a time series.

    Args:
        df: Frame with a 'datetime' column; rows may arrive unsorted.
        target_col: Column to derive features from.
        lags: Lag offsets, in rows (hours for contiguous hourly data).
        windows: Rolling-mean window sizes, in rows. Defaults to the
            previously hard-coded (3, 6, 12, 24).

    Returns:
        A copy of ``df`` sorted by 'datetime' with ``{target}_lag_{k}h``
        and ``{target}_roll_{w}h`` columns appended.
    """
    df = df.copy().sort_values('datetime')

    # NOTE: shift() is positional, so a lag of 1 only equals "one hour ago"
    # when the series is contiguous hourly data — gaps shift across them.
    for lag in lags:
        df[f'{target_col}_lag_{lag}h'] = df[target_col].shift(lag)

    # Rolling averages; min_periods=1 keeps the leading rows defined.
    for window in windows:
        df[f'{target_col}_roll_{window}h'] = df[target_col].rolling(window, min_periods=1).mean()

    return df
+ ) + model.fit(X_train, y_train) + + # Evaluate + y_pred = model.predict(X_test) + mae = mean_absolute_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + logger.info(f"\n{'='*60}") + logger.info("BASELINE MODEL RESULTS") + logger.info(f"{'='*60}") + logger.info(f"MAE: {mae:.2f} MW") + logger.info(f"RMSE: {rmse:.2f} MW") + logger.info(f"Mean Solar: {y_test.mean():.2f} MW") + logger.info(f"MAE/Mean: {mae/y_test.mean()*100:.1f}%") + + # Feature importance + logger.info("\nTop Feature Importances:") + importance = pd.DataFrame({ + 'feature': feature_cols, + 'importance': model.feature_importances_ + }).sort_values('importance', ascending=False) + + for _, row in importance.head(5).iterrows(): + logger.info(f" {row['feature']}: {row['importance']:.3f}") + + return model, mae, rmse + + except Exception as e: + logger.error(f"Training failed: {e}") + return None, None, None + + +def main(): + logger.info("="*60) + logger.info("INDIA PVNET - BASELINE TRAINING") + logger.info("="*60) + logger.info("Note: Using solar-only approach (no NWP data)") + logger.info("") + + # Load data + df = load_india_data() + + # Add features + df = add_temporal_features(df) + df = create_lag_features(df, 'solar_generation_mw', lags=[1, 2, 3, 6, 12, 24]) + + logger.info(f"Features created. Shape: {df.shape}") + + # Train + model, mae, rmse = train_baseline_model(df) + + if model is not None: + logger.info("\n" + "="*60) + logger.info("✅ Baseline training complete!") + logger.info("="*60) + logger.info("\nNext Steps:") + logger.info("1. Add NWP data (needs NOAA GFS processing for India)") + logger.info("2. Integrate with full PVNet model architecture") + logger.info("3. 
    def test_unsupported_region_raises(self):
        """Unsupported regions should raise ValueError."""
        # NOTE(review): imported inside the test, presumably so collection
        # still works when the module's optional deps are missing — confirm.
        from open_data_pvnet.nwp.gfs import process_gfs_data

        # "brazil" is not a supported region, so a ValueError mentioning
        # "Unsupported region" is expected.
        with pytest.raises(ValueError, match="Unsupported region"):
            process_gfs_data(year=2024, month=1, region="brazil")
    @patch("open_data_pvnet.scripts.download_gfs_india.process_month")
    def test_none_result_raises_runtime_error(self, mock_process_month):
        """If process_month returns None, should raise RuntimeError."""
        from open_data_pvnet.nwp.gfs import process_gfs_data

        # Simulate the downloader producing no output at all.
        mock_process_month.return_value = None

        # The wrapper is expected to surface this as a RuntimeError with a
        # "No GFS data processed" message rather than returning None.
        with pytest.raises(RuntimeError, match="No GFS data processed"):
            process_gfs_data(year=2024, month=1, region="india")
"data/gfs_india"