Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: MLOps Pipeline

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Lint Check
        run: |
          # Pin ruff so lint results are reproducible across runs
          pip install ruff==0.1.14
          ruff check .
          ruff format --check .

  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install Dependencies
        run: |
          pip install --upgrade pip
          # CPU-only saves space and prevents OOM errors in CI
          pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
          pip install pytest -e .
      - name: Run Unit Tests
        run: pytest tests/
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Keep in sync with the ruff==0.1.14 pin in .github/workflows/ci.yml,
    # otherwise local hooks and CI can disagree on lint/format results.
    rev: v0.1.14
    hooks:
      # Run the linter (check for bugs)
      - id: ruff
        args: [--fix]
      # Run the formatter (make it pretty)
      - id: ruff-format
41 changes: 35 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,50 @@
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# --- PROJECT METADATA & DEPENDENCIES ---
[project]
name = "ml_core"
version = "0.1.0"
description = "My MLOps project for SURF / UvA"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "numpy",
    "h5py",
    "matplotlib",
    "seaborn",
    "pandas",
    "torch",
    "torchvision",
    "torcheval",
    "tensorboard",
    "umap-learn",
    "tqdm",
    "pyyaml",
    "python-dotenv",
    # Dev tools
    "pytest",
    "ruff",
    "pre-commit",
]

# --- TOOL CONFIGURATION ---
[tool.ruff]
# Exclude a variety of commonly ignored directories.
exclude = [".git", "__pycache__", "venv"]
# Same as Black.
line-length = 88
indent-width = 4

[tool.ruff.lint]
# E: pycodestyle errors
# F: Pyflakes
# I: isort (Import Sorting - MLOps critical!)
# B: flake8-bugbear (Likely bugs)
select = ["E", "F", "I", "B"]
# E501: long lines, we let the formatter handle it
# B905: no strict=True needed for zip
# E741: allow short/ambiguous variable names
ignore = ["E501", "B905", "E741"]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"

[tool.pytest.ini_options]
testpaths = ["tests"]
71 changes: 71 additions & 0 deletions scripts/example_training_loop/training_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Example PCAM training script: trains a small MLP for a few epochs on
pre-split H5 data and saves the train/val learning curves as a PNG."""

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from ml_core.data import get_dataloaders
from ml_core.models import MLP

# 1. Setup Configuration
# Single source of truth for the epoch count: used by both the training
# loop and the plot below (was previously hard-coded in two places).
NUM_EPOCHS = 3
config = {
    "data": {"data_path": "../data/pcam/", "batch_size": 32, "num_workers": 2},
    "model": {"input_shape": [3, 96, 96], "hidden_units": [64, 32], "num_classes": 2},
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on: {device}")

# 2. Initialize Data, Model, Optimizer
train_loader, val_loader = get_dataloaders(config)
model = MLP(**config["model"]).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# 3. Training Loop
train_losses = []
val_losses = []

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_train_loss = 0.0

    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()

        if i % 100 == 0:  # Log-after-n-steps granularity
            print(f"Epoch {epoch + 1}, Step {i}, Loss: {loss.item():.4f}")

    train_losses.append(epoch_train_loss / len(train_loader))

    # Validation: no gradient tracking, model in eval mode (disables dropout)
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_val_loss += loss.item()

    val_losses.append(epoch_val_loss / len(val_loader))
    print(
        f"--- Epoch {epoch + 1} Summary: Train Loss {train_losses[-1]:.4f}, Val Loss {val_losses[-1]:.4f} ---"
    )

# 4. Plot learning curves
epochs_axis = range(1, NUM_EPOCHS + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs_axis, train_losses, label="Train Loss", marker="o")
plt.plot(epochs_axis, val_losses, label="Val Loss", marker="o")
plt.title(f"PCAM Training: First {NUM_EPOCHS} Epochs")
plt.xlabel("Epoch")
plt.ylabel("CrossEntropy Loss")
plt.legend()
plt.grid(True)
plt.savefig("pcam_learning_curves.png")
print("Training complete. Plot saved as pcam_learning_curves.png")
43 changes: 27 additions & 16 deletions src/ml_core/data/loader.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,41 @@
from pathlib import Path
from typing import Dict, Tuple

from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms

from .pcam import PCAMDataset


def get_dataloaders(config: Dict) -> Tuple[DataLoader, DataLoader]:
    """
    Factory function to create Train and Validation DataLoaders
    using pre-split H5 files.

    Args:
        config: Dict with a "data" section containing "data_path",
            "batch_size" and optionally "num_workers".

    Returns:
        (train_loader, val_loader). The train loader draws samples with a
        WeightedRandomSampler (inverse class frequency); the val loader
        iterates in file order so validation metrics are deterministic.
    """
    data_cfg = config["data"]
    base_path = Path(data_cfg["data_path"])

    def create_loader(split: str, use_sampler: bool = False, shuffle: bool = False):
        x_p = str(base_path / f"camelyonpatch_level_2_split_{split}_x.h5")
        y_p = str(base_path / f"camelyonpatch_level_2_split_{split}_y.h5")

        # Using ToTensor handles the (C, H, W) conversion and scaling to [0, 1]
        ds = PCAMDataset(x_p, y_p, transform=transforms.ToTensor())

        sampler = None
        if use_sampler:
            # Index through ds.indices so the per-sample weights stay aligned
            # with the dataset even when PCAMDataset drops filtered rows.
            labels = np.asarray(ds.y_data[:]).flatten()[ds.indices]
            class_counts = np.bincount(labels)
            # Inverse-frequency weight per sample -> class-balanced batches in expectation
            weights = 1.0 / class_counts[labels]
            sampler = WeightedRandomSampler(weights, len(weights))

        return DataLoader(
            ds,
            batch_size=data_cfg["batch_size"],
            sampler=sampler,
            num_workers=data_cfg.get("num_workers", 0),
            # DataLoader forbids shuffle=True together with a sampler
            shuffle=(shuffle and sampler is None),
        )

    train_loader = create_loader("train", use_sampler=True, shuffle=True)
    # Validation: fixed order, no resampling -> reproducible evaluation
    val_loader = create_loader("valid", use_sampler=False, shuffle=False)

    return train_loader, val_loader
61 changes: 39 additions & 22 deletions src/ml_core/data/pcam.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from pathlib import Path
from typing import Callable, Optional, Tuple

import h5py
Expand All @@ -8,30 +7,48 @@


class PCAMDataset(Dataset):
    """
    PatchCamelyon (PCAM) Dataset reader for H5 format.

    Args:
        x_path: H5 file holding images under key "x" (HWC layout per sample).
        y_path: H5 file holding labels under key "y".
        transform: Optional callable applied to each uint8 HWC image.
        filter_data: If True, drop degenerate patches whose pixel mean is
            exactly 0 (blackout) or 255 (washout).
    """

    def __init__(
        self,
        x_path: str,
        y_path: str,
        transform: Optional[Callable] = None,
        filter_data: bool = False,
    ):
        # NOTE(review): handles stay open for the dataset's lifetime and are
        # opened in the constructing process — confirm this behaves with
        # DataLoader num_workers > 0 (h5py handles are not fork-safe in general).
        self.x_data = h5py.File(x_path, "r")["x"]
        self.y_data = h5py.File(y_path, "r")["y"]
        self.transform = transform

        # Maps dataset positions -> raw H5 rows (identity unless filtering)
        self.indices = np.arange(len(self.x_data))

        if filter_data:
            # Chunked scan: one H5 read per chunk instead of one per image,
            # which is far faster on large files. Keep-condition is unchanged:
            # drop blackouts (mean == 0) and washouts (mean == 255).
            chunk_size = 1024
            kept = []
            for start in range(0, len(self.x_data), chunk_size):
                block = np.asarray(self.x_data[start : start + chunk_size])
                means = block.reshape(len(block), -1).mean(axis=1)
                kept.append(start + np.flatnonzero((means > 0) & (means < 255)))
            self.indices = (
                np.concatenate(kept) if kept else np.array([], dtype=np.int64)
            )

    def __len__(self) -> int:
        # Length after optional filtering; the DataLoader derives batch count from this
        return len(self.indices)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (image, label) at dataset position idx (after filtering)."""
        real_idx = self.indices[idx]
        img = self.x_data[real_idx]
        label = self.y_data[real_idx].item()

        # Handle NaNs explicitly before clipping/casting
        # This replaces NaNs with 0.0 (black)
        img = np.nan_to_num(img, nan=0.0)

        # Numerical Stability: Clip before uint8 cast
        img = np.clip(img, 0, 255).astype(np.uint8)

        if self.transform:
            img = self.transform(img)
        else:
            # Basic conversion if no transform provided: HWC uint8 -> CHW float
            img = torch.from_numpy(img).permute(2, 0, 1).float()

        return img, torch.tensor(label, dtype=torch.long)
26 changes: 18 additions & 8 deletions src/ml_core/models/mlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,23 @@ def __init__(
dropout_rate: float = 0.2,
):
super().__init__()

# TODO: Build the MLP architecture
# If you are up to the task, explore other architectures or model types
# Hint: Flatten -> [Linear -> ReLU -> Dropout] * N_layers -> Linear

pass

self.input_dim = input_shape[0] * input_shape[1] * input_shape[2]
layers = []
in_features = self.input_dim

for hidden in hidden_units:
layers.append(nn.Linear(in_features, hidden))
layers.append(nn.ReLU())
layers.append(nn.Dropout(dropout_rate))
in_features = hidden

# map to num_classes (e.g., 2 for binary because we are using CrossEntropyLoss)
layers.append(nn.Linear(in_features, num_classes))

self.network = nn.Sequential(*layers)
self.flatten = nn.Flatten()

def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Flatten the input batch and return raw class logits from the MLP.

    No softmax is applied: the training loop pairs this model with
    CrossEntropyLoss, which expects unnormalized logits.
    """
    x = self.flatten(x)
    return self.network(x)
Loading