Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a81df5a
enh - add Y transformation keyword argument in NormativeModel to enf…
contsili Feb 12, 2026
4e70e66
Merge branch 'dev' into log_positivity
contsili Feb 23, 2026
ef15def
Merge branch 'dev' into log_positivity
contsili Mar 2, 2026
1d2f536
enh - rename NormativeModel tests
contsili Mar 2, 2026
06f21a3
enh - generate data and model to use for the log transform test
contsili Mar 2, 2026
f79717e
enh - add log transform tests for centiles and predictions to ensure …
contsili Mar 2, 2026
2774ddf
enh - refactor log transformed centiles and yhat tests
contsili Mar 3, 2026
c791079
fix - discard test model fixtures
contsili Mar 3, 2026
fdc7371
enh - add blr model fixtures
contsili Mar 3, 2026
fce4ee3
fix - remove redundancies
contsili Mar 3, 2026
f8d735d
Merge branch 'dev' into log_positivity
contsili Mar 5, 2026
2f42207
fix - remove redundant import
contsili Mar 5, 2026
27201ab
enh - add log transform functionality
contsili Mar 5, 2026
d9e320a
Merge branch 'dev' into log_positivity
contsili Mar 5, 2026
87e4115
enh - rename
contsili Mar 5, 2026
80a442f
enh - rename
contsili Mar 5, 2026
4d31037
enh - refactor comments
contsili Mar 5, 2026
71d5f70
enh - add test fixtures for log transform
contsili Mar 5, 2026
dd702bd
enh - refactor existing log1p tests
contsili Mar 5, 2026
ba5844e
enh - add log transform tests
contsili Mar 5, 2026
e13a09a
enh - group tests for faster execution
contsili Mar 5, 2026
9b0ec3b
enh - check that the log transform can be performed
contsili Mar 5, 2026
235d13e
enh - plotter should assign positive response vars
contsili Mar 5, 2026
bc734f0
enh - last improvements to the tests
contsili Mar 5, 2026
60e5ea6
fix - clip before fitting
contsili Mar 5, 2026
1930194
enh - comment
contsili Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions pcntoolkit/normative_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ class NormativeModel:
Input (X/covariates) scaler to use.
outscaler: str
Output (Y/response_vars) scaler to use.
y_transform : str or None
Optional transform applied to Y before fitting and inverted
after prediction. Currently supported:
- ``"log1p"`` applies log(Y+1)
- ``"log"`` applies natural log(Y)
This is useful for phenotypes that cannot be negative.
Default is ``None`` (no transform).
name: str
Name of the model
"""
Expand All @@ -68,6 +75,7 @@ def __init__(
save_dir: Optional[str] = None,
inscaler: str = "standardize",
outscaler: str = "standardize",
y_transform: Optional[str] = None,
name: Optional[str] = None,
):
self.savemodel: bool = savemodel
Expand All @@ -77,6 +85,7 @@ def __init__(
self._save_dir = save_dir if save_dir is not None else get_default_save_dir()
self.inscaler: str = inscaler
self.outscaler: str = outscaler
self.y_transform: Optional[str] = y_transform
self.name: Optional[str] = name
self.response_vars: list[str] = None # type: ignore
self.template_regression_model: RegressionModel = template_regression_model
Expand Down Expand Up @@ -181,6 +190,7 @@ def transfer(self, transfer_data: NormData, save_dir: str | None = None, **kwarg
saveplots=True,
inscaler=self.inscaler,
outscaler=self.outscaler,
y_transform=self.y_transform,
save_dir=self.save_dir,
)
if save_dir is not None:
Expand Down Expand Up @@ -240,6 +250,7 @@ def extend(self, data: NormData, save_dir: str | None = None, n_synth_samples: i
saveplots=True,
inscaler=self.inscaler,
outscaler=self.outscaler,
y_transform=self.y_transform,
save_dir=save_dir,
)

Expand Down Expand Up @@ -460,6 +471,7 @@ def load(cls, path: str, into: NormativeModel | None = None) -> NormativeModel:
outscaler = metadata["outscaler"]
saveplots = metadata["saveplots"]
evaluate_model = metadata["evaluate_model"]
y_transform = metadata.get("y_transform", None)
name = metadata["name"]

response_vars = []
Expand Down Expand Up @@ -492,6 +504,7 @@ def load(cls, path: str, into: NormativeModel | None = None) -> NormativeModel:
save_dir=save_dir,
inscaler=inscaler,
outscaler=outscaler,
y_transform=y_transform,
name=name,
)
else:
Expand Down Expand Up @@ -549,9 +562,13 @@ def preprocess(self, data: NormData) -> None:
"""
Applies preprocessing transformations to the input data.

First applies an optional response transform (e.g. log1p), then scales.

Args:
data (NormData): Data to preprocess.
"""
# Enforce positivity if necessary
self._apply_y_transform(data)
self.scale_forward(data)

def scale_forward(self, data: NormData, overwrite: bool = False) -> None:
Expand Down Expand Up @@ -586,10 +603,15 @@ def scale_forward(self, data: NormData, overwrite: bool = False) -> None:
def postprocess(self, data: NormData) -> None:
"""Apply postprocessing to the data.

First unscales, then applies the inverse response transform (e.g. expm1).

Args:
data (NormData): Data to postprocess.
"""
self.scale_backward(data)
# Invert Y to its original space if positivity was enforced during
# preprocessing
self._invert_y_transform(data)

def scale_backward(self, data: NormData) -> None:
"""
Expand All @@ -606,6 +628,80 @@ def scale_backward(self, data: NormData) -> None:
"""
data.scale_backward(self.inscalers, self.outscalers)

def _apply_y_transform(self, data: NormData) -> None:
"""
Apply the forward response transform (e.g. log1p) to Y-like variables
in the data.

Parameters
----------
data : NormData
Data object containing response variable arrays (Y, Yhat,
centiles, thrive_Y) to which the transform should be applied.

"""
if self.y_transform is None:
return

# TODO: Check if we need to track if transform has already been
# applied to avoid double-inverting. Normally I dont expect any issues
# as every process() is followed by a postprocess(). The only issues can
# be if users call postprocess() multiple times manually or with
# compute_thrivelines() that has a preprocess() call without a postprocess().

if self.y_transform == "log1p":
# Apply log1p transform to the response variable Y
for var in ["Y"]:
if (data[var] < -1).any():
raise ValueError("Cannot apply log1p transform to variable "
f"'{var}' because it contains values less "
"than -1."
)
else:
data[var] = np.log1p(data[var])

elif self.y_transform == "log":
# Apply natural log transform to the response variable Y
for var in ["Y"]:
if (data[var] <= 0).any():
raise ValueError(
f"Cannot apply log transform to variable '{var}' "
"because it contains non-positive values. "
"Consider using 'log1p' transform or ensuring "
"all values are positive."
)
else:
data[var] = np.log(data[var])

def _invert_y_transform(self, data: NormData) -> None:
"""
Apply the inverse response transform (e.g. expm1) to Y-like variables
in the data.

Parameters
----------
data : NormData
Data object containing response variable arrays (Y, Yhat,
centiles, thrive_Y) to which the inverse transform should be applied.
"""
if self.y_transform is None:
return

# TODO: Check if we need to track if inverse transform has already been
# applied to avoid double-inverting. Normally I dont expect any issues
# as every process() is followed by a postprocess(). The only issues can
# be if users call postprocess() multiple times manually or with
# compute_thrivelines() that has a preprocess() call without a postprocess().

if self.y_transform == "log1p":
for var in ("Y", "centiles", "Yhat", "Y_harmonized", "thrive_Y"):
if var in data.data_vars:
data[var] = np.expm1(data[var])
elif self.y_transform == "log":
for var in ("Y", "centiles", "Yhat", "Y_harmonized", "thrive_Y"):
if var in data.data_vars:
data[var] = np.exp(data[var])

def evaluate(self, data: NormData) -> None:
"""
Evaluates the model performance on the data.
Expand Down Expand Up @@ -991,6 +1087,7 @@ def to_dict(self):
"is_fitted": self.is_fitted,
"inscaler": self.inscaler,
"outscaler": self.outscaler,
"y_transform": self.y_transform,
"ptk_version": importlib.metadata.version("pcntoolkit"),
}

Expand Down Expand Up @@ -1050,6 +1147,7 @@ def from_args(cls, **kwargs) -> NormativeModel:
inscaler = kwargs.get("inscaler", "none")
outscaler = kwargs.get("outscaler", "none")
name = kwargs.get("name", None)
y_transform = kwargs.get("y_transform", None)
assert "alg" in kwargs, "Algorithm must be specified"
if kwargs["alg"] == "blr":
template_regression_model = BLR.from_args("template", kwargs)
Expand All @@ -1068,6 +1166,7 @@ def from_args(cls, **kwargs) -> NormativeModel:
save_dir=save_dir,
inscaler=inscaler,
outscaler=outscaler,
y_transform=y_transform,
name=name,
)

Expand Down
1 change: 1 addition & 0 deletions pcntoolkit/regression_model/blr.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ def backward(self, X: xr.DataArray, be: xr.DataArray, Z: xr.DataArray) -> xr.Dat
self.ys[mask] = self.ys[mask] + residual_mean
self.s2[mask] = np.square(np.sqrt(self.s2[mask]) * correction_factor)

# Compute the centiles in the original Y space: centiles = Z * std + mean
centiles = np_Z * np.sqrt(self.s2) + self.ys
if self.warp:
centiles = self.warp.invf(centiles, self.gamma)
Expand Down
10 changes: 6 additions & 4 deletions pcntoolkit/util/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,10 @@ def plot_centiles(
# Batch effects are the first ones in the highlighted batch effects
for be, v in batch_effects.items():
centile_df[be] = v
# Response vars are all 0, we don't need them
# Assign a small positive placeholder value for response vars; the values
# themselves are unused, but later checks require response_vars > 0.
for rv in response_vars:
centile_df[rv] = 0
centile_df[rv] = 1e-6

centile_data = NormData.from_dataframe(
"centile",
Expand Down Expand Up @@ -352,9 +353,10 @@ def plot_centiles_advanced(
# Batch effects are the first ones in the highlighted batch effects
for be, v in batch_effects.items():
centile_df[be] = v[0]
# Response vars are all 0, we don't need them
# Assign a small positive placeholder value for response vars; the values
# themselves are unused, but later checks require response_vars > 0.
for rv in model.response_vars:
centile_df[rv] = 0
centile_df[rv] = 1e-6
centile_data = NormData.from_dataframe(
"centile",
dataframe=centile_df,
Expand Down
85 changes: 78 additions & 7 deletions test/fixtures/blr_model_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from math import log

import pytest
from typing import Any, Callable

Expand All @@ -11,6 +13,7 @@
from pcntoolkit.regression_model.blr import BLR
from test.fixtures.norm_data_fixtures import *
from test.fixtures.path_fixtures import *
import os

# Default keyword arguments shared by all BLR tests.
BLR_BASE_CONFIG: dict[str, Any] = {
Expand Down Expand Up @@ -84,7 +87,7 @@ def fitted_blr_model(


@pytest.fixture
def new_norm_blr_model(
def norm_blr_model(
blr_model_factory: Callable,
save_dir_blr
) -> NormativeModel:
Expand All @@ -102,12 +105,80 @@ def new_norm_blr_model(


@pytest.fixture
def fitted_norm_blr_model(new_norm_blr_model: NormativeModel,
def fitted_norm_blr_model(norm_blr_model: NormativeModel,
norm_data_from_arrays: NormData
) -> NormativeModel:
print("removing items")
if os.path.exists(new_norm_blr_model.save_dir):
shutil.rmtree(new_norm_blr_model.save_dir)
os.makedirs(new_norm_blr_model.save_dir, exist_ok=True)
new_norm_blr_model.fit(norm_data_from_arrays)
return new_norm_blr_model
if os.path.exists(norm_blr_model.save_dir):
shutil.rmtree(norm_blr_model.save_dir)
os.makedirs(norm_blr_model.save_dir, exist_ok=True)
norm_blr_model.fit(norm_data_from_arrays)
return norm_blr_model


@pytest.fixture
def log1p_transform_norm_blr_model(
    save_dir_test_model: str
) -> NormativeModel:
    """Build an un-fitted BLR-based NormativeModel using the log1p transform.

    Returns
    -------
    NormativeModel
        Un-fitted normative model configured with ``y_transform="log1p"``.
    """
    # Start from a clean save directory so stale artifacts from earlier
    # runs cannot leak into this test.
    save_dir = os.path.join(save_dir_test_model, "log1p")
    shutil.rmtree(save_dir, ignore_errors=True)
    os.makedirs(save_dir, exist_ok=True)
    # Regression model under test.
    regression_model = BLR("test_model_log1p")
    model = NormativeModel(
        template_regression_model=regression_model,
        savemodel=False,
        saveresults=False,
        evaluate_model=False,
        saveplots=False,
        save_dir=save_dir,
        inscaler="standardize",
        outscaler="standardize",
        name="test_model_log1p",
        y_transform="log1p",
    )
    return model


@pytest.fixture
def log_transform_norm_blr_model(
    save_dir_test_model: str,
) -> NormativeModel:
    """Build an un-fitted BLR-based NormativeModel using the natural-log transform.

    Returns
    -------
    NormativeModel
        Un-fitted normative model configured with ``y_transform="log"``.
    """
    # Recreate the fixture's save directory from scratch so artifacts from
    # previous test runs cannot interfere.
    save_dir = os.path.join(save_dir_test_model, "log")
    shutil.rmtree(save_dir, ignore_errors=True)
    os.makedirs(save_dir, exist_ok=True)
    # Regression model under test.
    regression_model = BLR("test_model_log")
    return NormativeModel(
        template_regression_model=regression_model,
        savemodel=False,
        saveresults=False,
        evaluate_model=False,
        saveplots=False,
        save_dir=save_dir,
        inscaler="standardize",
        outscaler="standardize",
        name="test_model_log",
        y_transform="log",
    )

3 changes: 3 additions & 0 deletions test/fixtures/test_model_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import shutil

import pytest

from pcntoolkit.dataio.norm_data import NormData
Expand Down
Loading
Loading