diff --git a/examples/composed-model.py b/examples/composed-model.py new file mode 100644 index 0000000..c5acc1e --- /dev/null +++ b/examples/composed-model.py @@ -0,0 +1,207 @@ +""" +Computing a Linear Model +======================== + +.. start-body + +In this tutorial we calculate a linear model using Ridge regression. +If you are never worked with equistore objects before please take a look at +the documentation. + +For constructing a linear Model we need the atomic descriptor as training data +``X`` as well as the energies and forces as target data ``y``. + +We first import all necessary packages. +""" + +import ase.io +import numpy as np +from equistore import Labels +from equistore.operations import ones_like, slice, sum_over_samples +from rascaline import SoapPowerSpectrum + +from equisolve.numpy.models.linear_model import Ridge +from equisolve.numpy.preprocessing import StandardScaler +from equisolve.numpy.compose import TransformedTargetRegressor, Pipeline + +from equisolve.utils.convert import ase_to_tensormap + + +# %% +# +# Dataset +# ------- +# +# As data set we use the SHIFTML set. You can obtain the dataset used in this +# example from our :download:`website<../../static/dataset.xyz>`. +# We read the first 20 structures of the data set using +# `ASE `. + + +frames = ase.io.read("dataset.xyz", ":20") + +# %% +# +# The data set contains everything we need for the model: +# The atomic positions we use for the descriptor and with this as +# training data. The data set also stores the energies and forces which will be our +# target data we regress against. +# +# Training data +# ------------- +# +# We construct the descriptor training data with a SOAP powerspectrum using +# rascaline. We first define the hyper parameters for the calculation + + +HYPER_PARAMETERS = { + "cutoff": 5.0, + "max_radial": 6, + "max_angular": 4, + "atomic_gaussian_width": 0.3, + "center_atom_weight": 1.0, + "radial_basis": { + "Gto": {}, + }, + "cutoff_function": { + "ShiftedCosine": {"width": 0.5}, + }, +} + +calculator = SoapPowerSpectrum(**HYPER_PARAMETERS) + +# %% +# +# And then run the actual calculation, including gradients with respect to positions. + +descriptor = calculator.compute(frames, gradients=["positions"]) + +# %% +# +# For more details on how the descriptor works see the documentation of +# rascaline. +# +# We now move all keys into properties to access them for our model. + +descriptor = descriptor.keys_to_samples("species_center") +descriptor = descriptor.keys_to_properties(["species_neighbor_1", "species_neighbor_2"]) + +# %% +# +# The descriptor contains a represenantion with respect to each central atoms per +# structure. However, our energies as target data is per structure only. +# Therefore, we sum the properties of each center atom per structure. + +X = sum_over_samples(descriptor, ["center", "species_center"]) + +# %% +# +# The newly defined :class:`equistore.TensorMap` contains a single block + +print(f"X contains {len(X.blocks())} block.") + +# %% +# +# As well as 1800 properties and 20 sample. +# +# We acces the data using the :meth:``equistore.TensorMap.block`` method + + +print(f"X contains {len(X.block().properties)} properties.") +print(f"X contains {len(X.block().samples)} samples.") + +# Target data +# ----------- +# +# We construct the target data by converting energies and forces into a +# :class:`equisolve.TensorMap`. + + +y = ase_to_tensormap(frames, energy="energy", forces="forces") + +# %% +# +# The target data y contains a single block + +print(y.block()) + +# %% +# +# Construct the model +# ------------------- +# +# Before we fit the model we have to define our regression values. +# +# For this we create a TensorMap containing with the desired regulerizer + +alpha = ones_like(X) +alpha.block().values[:] *= 1e-5 + +# %% +# +# So far ``alpha`` contains the same number of samples as ``X``. However, +# the regulerizer only has to be one sample, because all samples will be +# regulerized in the same way in a linear model. +# +# We remove all sample except the 0th one by using the +# :func:`equistore.operations.slice`. + +samples = Labels( + names=["structure"], + values=np.array([(0,)]), +) + +alpha = slice(alpha, samples=samples) + +# %% +# +# In our regulerizer we use the same values for all properties. However, +# :class:`equisolve.numpy.models.linear_model.Ridge` can also handle different +# regularization for each property. You can apply a property wise regularization by +# setting ``"values"`` of ``alpha_dict`` with an 1d array of the same length as the +# number of properties in the training data X (here 7200) +# +# With a valid regulerizer object we now initilize the Ridge object. +# ``parameter_keys`` determines with respect to which parameters the regression is +# performed. Here, we choose a regression wrt. to ``"values"`` (energies) and +# ``"positions"`` (forces). + +parameter_keys = ["values", "positions"] +ridge = Ridge(parameter_keys=parameter_keys, alpha=alpha) +standardizer = StandardScaler(parameter_keys=parameter_keys) +ttr = TransformedTargetRegressor(regressor=ridge, transformer=standardizer +clf = Pipeline([('scaler', StandardScaler(parameter_keys=["values", "positions"])), + ('ridge', ttr)]) + +# Old classifier +#clf = Ridge(parameter_keys=parameter_keys, alpha=alpha) + + +# %% +# +# Next we create a sample weighting :class:`equistiore.TensorMap` that weights energies +# five times more then the forces. + +sw = ones_like(y) +sw.block().values[:] *= 5 + +# %% +# +# The function `equisolve.utils.dictionary_to_tensormap` create a +# :class:`equistore.TensorMap` with the same shape as our target data ``y`` but with +# values a defined by ``sw_dict``. + +print(sw) + +# Finally we can fit the model using the sample weights defined above. + +clf.fit(X, y) +#clf.fit(X, y, sample_weight=sw) TODO, passing args in pipes ant ttr is not implemented yet + + +# Finally we can predict values and calculate the root mean squre error +# of our model. + +clf.predict(X) +print(f"RMSE energies = {clf.score(X, y, parameter_key='values')[0]:.3f} eV") +print(f"RMSE forces = {clf.score(X, y, parameter_key='positions')[0]:.3f} eV/Å") diff --git a/src/equisolve/numpy/__init__.py b/src/equisolve/numpy/__init__.py index b166d39..48ee1f3 100644 --- a/src/equisolve/numpy/__init__.py +++ b/src/equisolve/numpy/__init__.py @@ -9,4 +9,4 @@ from . import models, utils -__all__ = ["models", "utils", "preprocessing"] +__all__ = ["models", "utils", "preprocessing", "compose"] diff --git a/src/equisolve/numpy/compose/__init__.py b/src/equisolve/numpy/compose/__init__.py new file mode 100644 index 0000000..f49b10d --- /dev/null +++ b/src/equisolve/numpy/compose/__init__.py @@ -0,0 +1,12 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# +# Copyright (c) 2023 Authors and contributors +# (see the AUTHORS.rst file for the full list of names) +# +# Released under the BSD 3-Clause "New" or "Revised" License +# SPDX-License-Identifier: BSD-3-Clause + +from ._base import Pipeline, TransformedTargetRegressor + + +__all__ = ["Pipeline", "TransformedTargetRegressor"] diff --git a/src/equisolve/numpy/compose/_base.py b/src/equisolve/numpy/compose/_base.py new file mode 100644 index 0000000..f99d381 --- /dev/null +++ b/src/equisolve/numpy/compose/_base.py @@ -0,0 +1,106 @@ +# -*- Mode: python3; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# +# Copyright (c) 2023 Authors and contributors +# (see the AUTHORS.rst file for the full list of names) +# +# Released under the BSD 3-Clause "New" or "Revised" License +# SPDX-License-Identifier: BSD-3-Clause + +from copy import deepcopy + +from typing import List, Optional, Union +from equistore import TensorMap + +from ...utils.metrics import rmse + +class TransformedTargetRegressor: + r"""Regressor that transforms the target values with a transformer before fitting + and inverse transforms the predicted target before outputting the score. + This is useful for doing a regression on a numerical stable scale while still outputting the scores in the original scale. + + :param regressor: The regressor used for doing the prediction. + + :param transformer: The transformer applied on the target y values before regression. + """ + + def __init__(self, regressor=None, transformer=None): + self.regressor = regressor + self.transformer = transformer + + if set(regressor.parameter_keys) != set(transformer.parameter_keys): + raise ValueError("parameter_keys between regressor and transformer do not match") + + if regressor is not None: + if hasattr(regressor, "fit") and not(callable(regressor.fit)): + raise ValueError("regressor does not have fit function") + # TODO more checks for functionalities + + # COMMENT I am not sure if we should check the regressor and transformer this way, + # a base class with defined functionalities seems more clean + + + # COMMENT scikit-learn offers also + #self.func + #self.inverse_func + #self.check_inverse + # but we do not need them at the moment, make low-prio issue to not forget? + + def fit(self, X: TensorMap, y: TensorMap) -> None: + # COMMENT default args StandardScaler + Ridge? But what parameter keys? + #if self.regressor is None: + # self.regressor_ = Ridge() + #if self.transformer is None: + # self.transformer_ = StandardScaler() + + self.transformer_ = deepcopy(self.transformer).fit(y) + self.regressor_ = deepcopy(self.regressor).fit(X, self.transformer_.transform(y)) + return self + + def predict(self, X: TensorMap) -> TensorMap: + return self.transformer_.inverse_transform(self.regressor_.predict(X)) + + def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> float: + y_pred = self.transformer_.inverse_transform(self.regressor_.predict(X)) + return rmse(y, y_pred, parameter_key) + + +class Pipeline: + r"""sklearn style Pipeline + + :param steps: list of (name, transformer/estimator) tuples that are applied in the fit and prediction steps consecutiely in the given order + last tuple must contain estimator, all other tuples must contain transformers + #COMMENT sklearn Pipeline supports more, but I think this limitation is okay for now + """ + def __init__(self, steps=None): + self.steps = steps + + def fit(self, X : TensorMap, y : TensorMap) -> None: + # TODO check if they have fit function, for this we require base clases for estimator and transformers + for step_idx, (name, transformer) in enumerate(self.steps[:-1]): + fitted_transformer = deepcopy(transformer).fit(X, y) + X = fitted_transformer.transform(X) + self.steps[step_idx] = (name, fitted_transformer) + + final_estimator = self.steps[-1][1] + final_estimator.fit(X,y) + return self + + def predict(self, X: TensorMap) -> TensorMap: + # TODO check they are fitted, for this we require base clases for estimator and transformers + for _, transformer in self.steps[:-1]: + X = transformer.transform(X) + return self.steps[-1][1].predict(X) + + def score(self, X: TensorMap, y: TensorMap, parameter_key: Optional[str]) -> float: + # COMMENT we use scorer from last estimator, maybe not what we want + for _, transformer in self.steps[:-1]: + X = transformer.transform(X) + final_estimator = self.steps[-1][1] + # COMMENT: what do you think about using default parameter_keys? + if parameter_key is None: + scores = np.zeros(len(final_estimator.parameter_keys)) + for i, parameter_key in enumerate(final_estimator.parameter_keys): + scores[i] = final_estimator.score(X, y, parameter_key)[0] + return np.mean(scores) + else: + return final_estimator.score(X, y, parameter_key) diff --git a/src/equisolve/numpy/models/linear_model.py b/src/equisolve/numpy/models/linear_model.py index febc87e..6a1712e 100644 --- a/src/equisolve/numpy/models/linear_model.py +++ b/src/equisolve/numpy/models/linear_model.py @@ -61,6 +61,8 @@ def __init__( self.coef_ = None + self.alpha = tensor_map_to_dict(self.alpha) + def _validate_data(self, X: TensorMap, y: Optional[TensorMap] = None) -> None: """Validates :class:`equistore.TensorBlock`'s for the usage in models. @@ -191,9 +193,10 @@ def fit( ) coef_blocks.append(coef_block) - # Convert alpha to a dictionary to be used in external models. - self.alpha = tensor_map_to_dict(self.alpha) self.coef_ = TensorMap(X.keys, coef_blocks) + # Convert alpha and coefs to a dictionary to be used in external models. + self.alpha = tensor_map_to_dict(self.alpha) + self.coef_ = tensor_map_to_dict(self.coef_) return self @@ -207,9 +210,12 @@ def predict(self, X: TensorMap) -> TensorMap: if self.coef_ is None: raise ValueError("No weights. Call fit method first.") - return dot(X, self.coef_) + self.coef_ = dict_to_tensor_map(self.coef_) + y_pred = dot(X, self.coef_) + self.coef_ = tensor_map_to_dict(self.coef_) + return y_pred - def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]: + def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]: # COMMENT why does it return list of floats if we just allow one paramater_key? """Return the coefficient of determination of the prediction. :param X: Test samples diff --git a/tests/numpy/compose/test_base.py b/tests/numpy/compose/test_base.py new file mode 100644 index 0000000..44cf060 --- /dev/null +++ b/tests/numpy/compose/test_base.py @@ -0,0 +1,58 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# +# Copyright (c) 2023 Authors and contributors +# (see the AUTHORS.rst file for the full list of names) +# +# Released under the BSD 3-Clause "New" or "Revised" License +# SPDX-License-Identifier: BSD-3-Clause + +import os + +import ase.io +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from equisolve.numpy.compose import TransformedTargetRegressor +from equisolve.numpy.models import Ridge +from equisolve.numpy.preprocessing import StandardScaler + +from equisolve.numpy.utils import matrix_to_block, tensor_to_tensormap + + + +class TestTransformedTargetRegressor: + """Test the TransformedTargetRegressor.""" + + rng = np.random.default_rng(0x122578748CFF12AD12) + + num_properties = np.array([91, 100, 119, 221, 256]) + num_targets = np.array([1000, 2187]) + + @pytest.mark.parametrize("num_properties", num_properties) + @pytest.mark.parametrize("num_targets", num_targets) + def test_ttr(self, num_properties, num_targets): + """Test if ridge is working and all shapes are converted correctly. + + Test is performed for two blocks. + """ + # Create input values + X_arr = self.rng.random([2, num_targets, num_properties]) + y_arr = self.rng.random([2, num_targets, 1]) + alpha_arr = np.ones([2, 1, num_properties]) + # TODO add as soon weights are supported + sw_arr = np.ones([2, num_targets, 1]) + + X = tensor_to_tensormap(X_arr) + y = tensor_to_tensormap(y_arr) + alpha = tensor_to_tensormap(alpha_arr) + # TODO add as soon weights are supported + #sw = tensor_to_tensormap(sw_arr) + + parameter_keys = "values" + ridge = Ridge(parameter_keys=parameter_keys, alpha=alpha) + standardizer = StandardScaler(parameter_keys=parameter_keys) + clf = TransformedTargetRegressor(regressor=ridge, transformer=standardizer) + clf.fit(X=X, y=y) + clf.predict(X=X) # TODO put this in a separate test, right now its just testing if it runs through + clf.score(X=X) # TODO put this in a separate test, right now its just testing if it runs through