Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: MLOps Pipeline

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Lint Check
        run: |
          # Pin ruff so lint results are reproducible across runs
          pip install ruff==0.1.14
          ruff check .
          ruff format --check .

  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install Dependencies
        run: |
          pip install --upgrade pip
          # CPU-only saves space and prevents OOM errors in CI
          pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
          pip install pytest -e .
      - name: Run Unit Tests
        run: pytest tests/
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Keep in sync with the ruff==0.1.14 pin in .github/workflows/ci.yml,
    # otherwise local hooks and CI can disagree on lint/format results.
    rev: v0.1.14
    hooks:
      # Run the linter (check for bugs)
      - id: ruff
        args: [--fix]
      # Run the formatter (make it pretty)
      - id: ruff-format
41 changes: 35 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,50 @@
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# --- PROJECT METADATA & DEPENDENCIES ---
[project]
name = "ml_core"
version = "0.1.0"
description = "My MLOps project for SURF / UvA"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "numpy",
    "h5py",
    "matplotlib",
    "seaborn",
    "pandas",
    "torch",
    "torchvision",
    "torcheval",
    "tensorboard",
    "umap-learn",
    "tqdm",
    "pyyaml",
    "python-dotenv",
    # Dev tools
    "pytest",
    "ruff",
    "pre-commit",
]

# --- TOOL CONFIGURATION ---
[tool.ruff]
# Exclude a variety of commonly ignored directories.
exclude = [".git", "__pycache__", "venv"]
# Same as Black.
line-length = 88
indent-width = 4

[tool.ruff.lint]
# E: pycodestyle errors
# F: Pyflakes
# I: isort (Import Sorting - MLOps critical!)
# B: flake8-bugbear (Likely bugs)
select = ["E", "F", "I", "B"]
# E501: long lines, we let the formatter handle it
# B905: no strict=True needed for zip
# E741: allow short/ambiguous variable names
ignore = ["E501", "B905", "E741"]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"

[tool.pytest.ini_options]
testpaths = ["tests"]
71 changes: 71 additions & 0 deletions scripts/example_training_loop/training_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Example PCAM training script: trains a small MLP for a few epochs on
pre-split H5 data and saves the train/val learning curves as a PNG."""

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from ml_core.data import get_dataloaders
from ml_core.models import MLP

# 1. Setup Configuration
# Single source of truth for the epoch count: used by both the training
# loop and the plot below (was previously hard-coded in two places).
NUM_EPOCHS = 3
config = {
    "data": {"data_path": "../data/pcam/", "batch_size": 32, "num_workers": 2},
    "model": {"input_shape": [3, 96, 96], "hidden_units": [64, 32], "num_classes": 2},
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on: {device}")

# 2. Initialize Data, Model, Optimizer
train_loader, val_loader = get_dataloaders(config)
model = MLP(**config["model"]).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# 3. Training Loop
train_losses = []
val_losses = []

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_train_loss = 0.0

    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()

        if i % 100 == 0:  # Log-after-n-steps granularity
            print(f"Epoch {epoch + 1}, Step {i}, Loss: {loss.item():.4f}")

    train_losses.append(epoch_train_loss / len(train_loader))

    # Validation: no gradient tracking, model in eval mode (disables dropout)
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_val_loss += loss.item()

    val_losses.append(epoch_val_loss / len(val_loader))
    print(
        f"--- Epoch {epoch + 1} Summary: Train Loss {train_losses[-1]:.4f}, Val Loss {val_losses[-1]:.4f} ---"
    )

# 4. Plot learning curves
epochs_axis = range(1, NUM_EPOCHS + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs_axis, train_losses, label="Train Loss", marker="o")
plt.plot(epochs_axis, val_losses, label="Val Loss", marker="o")
plt.title(f"PCAM Training: First {NUM_EPOCHS} Epochs")
plt.xlabel("Epoch")
plt.ylabel("CrossEntropy Loss")
plt.legend()
plt.grid(True)
plt.savefig("pcam_learning_curves.png")
print("Training complete. Plot saved as pcam_learning_curves.png")
43 changes: 27 additions & 16 deletions src/ml_core/data/loader.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,41 @@
from pathlib import Path
from typing import Dict, Tuple

from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms

from .pcam import PCAMDataset


def get_dataloaders(config: Dict) -> Tuple[DataLoader, DataLoader]:
    """
    Factory function to create Train and Validation DataLoaders
    using pre-split H5 files.

    Args:
        config: Dict with a "data" section containing "data_path",
            "batch_size" and optionally "num_workers".

    Returns:
        (train_loader, val_loader). The train loader draws samples with a
        WeightedRandomSampler (inverse class frequency); the val loader
        iterates in file order so validation metrics are deterministic.
    """
    data_cfg = config["data"]
    base_path = Path(data_cfg["data_path"])

    def create_loader(split: str, use_sampler: bool = False, shuffle: bool = False):
        x_p = str(base_path / f"camelyonpatch_level_2_split_{split}_x.h5")
        y_p = str(base_path / f"camelyonpatch_level_2_split_{split}_y.h5")

        # Using ToTensor handles the (C, H, W) conversion and scaling to [0, 1]
        ds = PCAMDataset(x_p, y_p, transform=transforms.ToTensor())

        sampler = None
        if use_sampler:
            # Index through ds.indices so the per-sample weights stay aligned
            # with the dataset even when PCAMDataset drops filtered rows.
            labels = np.asarray(ds.y_data[:]).flatten()[ds.indices]
            class_counts = np.bincount(labels)
            # Inverse-frequency weight per sample -> class-balanced batches in expectation
            weights = 1.0 / class_counts[labels]
            sampler = WeightedRandomSampler(weights, len(weights))

        return DataLoader(
            ds,
            batch_size=data_cfg["batch_size"],
            sampler=sampler,
            num_workers=data_cfg.get("num_workers", 0),
            # DataLoader forbids shuffle=True together with a sampler
            shuffle=(shuffle and sampler is None),
        )

    train_loader = create_loader("train", use_sampler=True, shuffle=True)
    # Validation: fixed order, no resampling -> reproducible evaluation
    val_loader = create_loader("valid", use_sampler=False, shuffle=False)

    return train_loader, val_loader
61 changes: 39 additions & 22 deletions src/ml_core/data/pcam.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from pathlib import Path
from typing import Callable, Optional, Tuple

import h5py
Expand All @@ -8,30 +7,48 @@


class PCAMDataset(Dataset):
    """
    PatchCamelyon (PCAM) Dataset reader for H5 format.

    Args:
        x_path: H5 file holding images under key "x" (HWC layout per sample).
        y_path: H5 file holding labels under key "y".
        transform: Optional callable applied to each uint8 HWC image.
        filter_data: If True, drop degenerate patches whose pixel mean is
            exactly 0 (blackout) or 255 (washout).
    """

    def __init__(
        self,
        x_path: str,
        y_path: str,
        transform: Optional[Callable] = None,
        filter_data: bool = False,
    ):
        # NOTE(review): handles stay open for the dataset's lifetime and are
        # opened in the constructing process — confirm this behaves with
        # DataLoader num_workers > 0 (h5py handles are not fork-safe in general).
        self.x_data = h5py.File(x_path, "r")["x"]
        self.y_data = h5py.File(y_path, "r")["y"]
        self.transform = transform

        # Maps dataset positions -> raw H5 rows (identity unless filtering)
        self.indices = np.arange(len(self.x_data))

        if filter_data:
            # Chunked scan: one H5 read per chunk instead of one per image,
            # which is far faster on large files. Keep-condition is unchanged:
            # drop blackouts (mean == 0) and washouts (mean == 255).
            chunk_size = 1024
            kept = []
            for start in range(0, len(self.x_data), chunk_size):
                block = np.asarray(self.x_data[start : start + chunk_size])
                means = block.reshape(len(block), -1).mean(axis=1)
                kept.append(start + np.flatnonzero((means > 0) & (means < 255)))
            self.indices = (
                np.concatenate(kept) if kept else np.array([], dtype=np.int64)
            )

    def __len__(self) -> int:
        # Length after optional filtering; the DataLoader derives batch count from this
        return len(self.indices)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (image, label) at dataset position idx (after filtering)."""
        real_idx = self.indices[idx]
        img = self.x_data[real_idx]
        label = self.y_data[real_idx].item()

        # Handle NaNs explicitly before clipping/casting
        # This replaces NaNs with 0.0 (black)
        img = np.nan_to_num(img, nan=0.0)

        # Numerical Stability: Clip before uint8 cast
        img = np.clip(img, 0, 255).astype(np.uint8)

        if self.transform:
            img = self.transform(img)
        else:
            # Basic conversion if no transform provided: HWC uint8 -> CHW float
            img = torch.from_numpy(img).permute(2, 0, 1).float()

        return img, torch.tensor(label, dtype=torch.long)
26 changes: 18 additions & 8 deletions src/ml_core/models/mlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,23 @@ def __init__(
dropout_rate: float = 0.2,
):
super().__init__()

# TODO: Build the MLP architecture
# If you are up to the task, explore other architectures or model types
# Hint: Flatten -> [Linear -> ReLU -> Dropout] * N_layers -> Linear

pass

self.input_dim = input_shape[0] * input_shape[1] * input_shape[2]
layers = []
in_features = self.input_dim

for hidden in hidden_units:
layers.append(nn.Linear(in_features, hidden))
layers.append(nn.ReLU())
layers.append(nn.Dropout(dropout_rate))
in_features = hidden

# map to num_classes (e.g., 2 for binary because we are using CrossEntropyLoss)
layers.append(nn.Linear(in_features, num_classes))

self.network = nn.Sequential(*layers)
self.flatten = nn.Flatten()

def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Flatten the input batch and return raw class logits from the MLP.

    No softmax is applied: the training loop pairs this model with
    CrossEntropyLoss, which expects unnormalized logits.
    """
    x = self.flatten(x)
    return self.network(x)
Loading