From 3336dad0a0e50e191a6aef1854bf72297718a3bd Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Tue, 9 Sep 2025 17:04:31 +0200 Subject: [PATCH 1/6] Initial implementation of SmartGradient Co-authored-by: esmail-abdulfattah --- src/dalia/core/dalia.py | 108 ++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index 5756d086..6b00303c 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -5,6 +5,8 @@ from scipy import optimize from tabulate import tabulate +import numpy as np + from dalia import ArrayLike, NDArray, backend_flags, comm_rank, comm_size, sp, xp from dalia.configs.dalia_config import DaliaConfig from dalia.core.model import Model @@ -193,6 +195,7 @@ def __init__( # --- Set up recurrent variables self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64) self.f_values_i = xp.zeros(self.n_f_evaluations, dtype=xp.float64) + self.gradient_basis = xp.eye(self.model.n_hyperparameters, dtype=xp.float64) self.eps_mat = xp.zeros( (self.model.n_hyperparameters, self.model.n_hyperparameters), dtype=xp.float64, @@ -220,6 +223,22 @@ def __init__( logging.info("DALIA initialized.") print_msg("DALIA initialized.", flush=True) + + + + # SMART GRADIENT STUFF + # self.fd_step = xp.cbrt(xp.finfo(float).eps) + self.fd_step = 1e-3 + + self.noise_stddev = 7e-8 # To avoid singularity during QR + + self.G = xp.identity(self.model.n_hyperparameters) + self.c_G = xp.identity(self.model.n_hyperparameters) + self.prev_theta = xp.zeros(self.model.n_hyperparameters) + self.curr_theta = xp.zeros(self.model.n_hyperparameters) + self.count = 0 + self.rng = np.random.default_rng() + def _print_init(self) -> None: """ Print informations about the DALIA solver. @@ -529,6 +548,75 @@ def callback(intermediate_result: optimize.OptimizeResult): return self.minimization_result + def _transformed_fun(self, phi): + return self.curr_theta + self.G @ phi + + def _scale(self, x): + mean = xp.mean(x) + std = xp.std(x, ddof=1) + if std < 1e-12: + return x - mean + return (x - mean) / std + + def _update_G(self, current_theta): + self.curr_theta = current_theta + self.c_G = xp.roll(self.c_G, 1, axis=1) + xdiff = current_theta - self.prev_theta + xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters)) + self.c_G[:, 0] = self._scale(xdiff) + self.G = self.c_G + + def _orthogonalize_G(self): + try: + Q, R = xp.linalg.qr(self.G) + self.G = Q + except xp.linalg.LinAlgError: + print("Warning: QR decomposition failed. Resetting G to identity.") + self.G = xp.identity(self.model.n_hyperparameters) + + def _get_original_grad(self, transformed_grad): + return xp.linalg.solve(self.G.T, transformed_grad) + + def _update_gradient_basis(self) -> None: + for i in range(self.model.n_hyperparameters): + self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2] + + def _construct_f_evaluation_points(self, theta_i: NDArray) -> None: + if self.count > 0: + self._update_G(theta_i) + else: + self.curr_theta = theta_i # Set the starting point + + self._orthogonalize_G() + + self.prev_theta = xp.copy(theta_i) + self.count += 1 + + # Initialize central difference scheme matrix + self.eps_mat[:] = self.fd_step * self.gradient_basis + self.theta_mat[:] = xp.zeros( + (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64 + ) + + self.curr_theta.T + + self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T) + + self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat + self.theta_mat[ + :, self.model.n_hyperparameters + 1 : self.n_f_evaluations + ] -= self.eps_mat + + for i in range(1, self.n_f_evaluations): + self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T + + def _compute_gradient(self): + for i in range(self.model.n_hyperparameters): + self.gradient_f[i] = ( + self.f_values_i[i + 1] + - self.f_values_i[self.model.n_hyperparameters + i + 1] + ) / (2 * self.fd_step) + def _objective_function( self, theta_i: NDArray, @@ -563,15 +651,9 @@ def _objective_function( for i in range(self.n_f_evaluations): task_mapping.append(i % n_feval_comm) - # Initialize central difference scheme matrix - self.eps_mat[:] = self.eps_gradient_f * xp.eye(self.model.n_hyperparameters) - self.theta_mat[:] = xp.repeat( - get_device(theta_i).reshape(-1, 1), self.n_f_evaluations, axis=1 - ) - self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat - self.theta_mat[ - :, self.model.n_hyperparameters + 1 : self.n_f_evaluations - ] -= self.eps_mat + self._construct_f_evaluation_points(theta_i=get_device(theta_i)) + + # Proceed to the parallel function evaluation for feval_i in range(self.n_f_evaluations - 1, -1, -1): @@ -593,14 +675,10 @@ def _objective_function( synchronize(comm=self.comm_world) # Compute gradient using central difference scheme - for i in range(self.model.n_hyperparameters): - self.gradient_f[i] = ( - self.f_values_i[i + 1] - - self.f_values_i[self.model.n_hyperparameters + i + 1] - ) / (2 * self.eps_gradient_f) + self._compute_gradient() f_0 = get_host(self.f_values_i[0]) - grad_f = get_host(self.gradient_f) + grad_f = get_host(self._get_original_grad(self.gradient_f)) synchronize(comm=self.comm_world) toc = time.perf_counter() From 537df2ecb6a81bd4b7597d6790600cd70e48cc36 Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Wed, 10 Sep 2025 11:40:13 +0200 Subject: [PATCH 2/6] draft of clean implementation of GradientMethod --- src/dalia/configs/dalia_config.py | 1 - src/dalia/configs/gradient_method_config.py | 22 ++++ src/dalia/core/dalia.py | 95 ++------------- src/dalia/core/gradient_method.py | 19 +++ src/dalia/gradient_methods/__init__.py | 6 + src/dalia/gradient_methods/smart_gradient.py | 111 ++++++++++++++++++ .../gradient_methods/vanilla_gradient.py | 22 ++++ 7 files changed, 188 insertions(+), 88 deletions(-) create mode 100644 src/dalia/configs/gradient_method_config.py create mode 100644 src/dalia/core/gradient_method.py create mode 100644 src/dalia/gradient_methods/__init__.py create mode 100644 src/dalia/gradient_methods/smart_gradient.py create mode 100644 src/dalia/gradient_methods/vanilla_gradient.py diff --git a/src/dalia/configs/dalia_config.py b/src/dalia/configs/dalia_config.py index 0dbbbec0..b37dd736 100644 --- a/src/dalia/configs/dalia_config.py +++ b/src/dalia/configs/dalia_config.py @@ -28,7 +28,6 @@ class BFGSConfig(BaseModel): # c1: float = 1e-4 # only relevant for BFGS not for L-BFGS-B # c2: float = 0.9 # only relevant for BFGS not for L-BFGS-B disp: bool = False - class DaliaConfig(BaseModel): diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py new file mode 100644 index 00000000..9782e342 --- /dev/null +++ b/src/dalia/configs/gradient_method_config.py @@ -0,0 +1,22 @@ +from abc import ABC +from typing import Literal + +from pydantic import BaseModel, ConfigDict + + +class GradientMethodConfig(BaseModel, ABC): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + # Input folder for this specific submodel + input_dir: str = None + type: Literal["vanilla_gradient", "smart_gradient"] = "smart_gradient" + + finite_difference_epsilon: float = 1e-3 + +class VanillaGradientConfig(GradientMethodConfig): ... + +class SmartGradientConfig(GradientMethodConfig): + # The diagonal noise ensure to avoid singularities in the QR decomposition + diagonal_noise: float = 1e-8 + # Threshold below which scaling is not performed + scaling_threashold: float = 1e-12 \ No newline at end of file diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index 6b00303c..2ac41088 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -11,6 +11,7 @@ from dalia.configs.dalia_config import DaliaConfig from dalia.core.model import Model from dalia.solvers import DenseSolver, DistSerinvSolver, SerinvSolver, SparseSolver +from dalia.gradient_methods import VanillaGradient from dalia.utils import ( DummyCommunicator, add_str_header, @@ -192,6 +193,9 @@ def __init__( nccl_comm=self.nccl_comm, ) + # --- Initialize Gradient Method + self.gradient_method = VanillaGradient(basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3) + # --- Set up recurrent variables self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64) self.f_values_i = xp.zeros(self.n_f_evaluations, dtype=xp.float64) @@ -223,22 +227,6 @@ def __init__( logging.info("DALIA initialized.") print_msg("DALIA initialized.", flush=True) - - - - # SMART GRADIENT STUFF - # self.fd_step = xp.cbrt(xp.finfo(float).eps) - self.fd_step = 1e-3 - - self.noise_stddev = 7e-8 # To avoid singularity during QR - - self.G = xp.identity(self.model.n_hyperparameters) - self.c_G = xp.identity(self.model.n_hyperparameters) - self.prev_theta = xp.zeros(self.model.n_hyperparameters) - self.curr_theta = xp.zeros(self.model.n_hyperparameters) - self.count = 0 - self.rng = np.random.default_rng() - def _print_init(self) -> None: """ Print informations about the DALIA solver. @@ -548,75 +536,6 @@ def callback(intermediate_result: optimize.OptimizeResult): return self.minimization_result - def _transformed_fun(self, phi): - return self.curr_theta + self.G @ phi - - def _scale(self, x): - mean = xp.mean(x) - std = xp.std(x, ddof=1) - if std < 1e-12: - return x - mean - return (x - mean) / std - - def _update_G(self, current_theta): - self.curr_theta = current_theta - self.c_G = xp.roll(self.c_G, 1, axis=1) - xdiff = current_theta - self.prev_theta - xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters)) - self.c_G[:, 0] = self._scale(xdiff) - self.G = self.c_G - - def _orthogonalize_G(self): - try: - Q, R = xp.linalg.qr(self.G) - self.G = Q - except xp.linalg.LinAlgError: - print("Warning: QR decomposition failed. Resetting G to identity.") - self.G = xp.identity(self.model.n_hyperparameters) - - def _get_original_grad(self, transformed_grad): - return xp.linalg.solve(self.G.T, transformed_grad) - - def _update_gradient_basis(self) -> None: - for i in range(self.model.n_hyperparameters): - self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2] - - def _construct_f_evaluation_points(self, theta_i: NDArray) -> None: - if self.count > 0: - self._update_G(theta_i) - else: - self.curr_theta = theta_i # Set the starting point - - self._orthogonalize_G() - - self.prev_theta = xp.copy(theta_i) - self.count += 1 - - # Initialize central difference scheme matrix - self.eps_mat[:] = self.fd_step * self.gradient_basis - self.theta_mat[:] = xp.zeros( - (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64 - ) - - self.curr_theta.T - - self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T) - - self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat - self.theta_mat[ - :, self.model.n_hyperparameters + 1 : self.n_f_evaluations - ] -= self.eps_mat - - for i in range(1, self.n_f_evaluations): - self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T - - def _compute_gradient(self): - for i in range(self.model.n_hyperparameters): - self.gradient_f[i] = ( - self.f_values_i[i + 1] - - self.f_values_i[self.model.n_hyperparameters + i + 1] - ) / (2 * self.fd_step) - def _objective_function( self, theta_i: NDArray, @@ -651,9 +570,11 @@ def _objective_function( for i in range(self.n_f_evaluations): task_mapping.append(i % n_feval_comm) - self._construct_f_evaluation_points(theta_i=get_device(theta_i)) + self.gradient_method.get_evaluation_directions(direction_matrix=self.theta_mat) - + print(self.theta_mat) + + exit() # Proceed to the parallel function evaluation for feval_i in range(self.n_f_evaluations - 1, -1, -1): diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py new file mode 100644 index 00000000..f795140e --- /dev/null +++ b/src/dalia/core/gradient_method.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod + +from dalia import xp + + +class GradientMethod(ABC): + """Core class for gradient computation methods.""" + + def __init__(self, basis_size, finite_difference_epsilon): + self.basis = xp.identity((basis_size, basis_size), dtype=xp.float64) + self.finite_difference_epsilon = finite_difference_epsilon + + @abstractmethod + def get_evaluation_directions(self, direction_matrix) -> None: + ... + + @abstractmethod + def compute_gradient(self, gradient) -> None: + ... \ No newline at end of file diff --git a/src/dalia/gradient_methods/__init__.py b/src/dalia/gradient_methods/__init__.py new file mode 100644 index 00000000..95487c21 --- /dev/null +++ b/src/dalia/gradient_methods/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2024-2025 DALIA authors. All rights reserved. + +from dalia.gradient_methods.vanilla_gradient import VanillaGradient +# from dalia.gradient_methods.smart_gradient import SmartGradient + +__all__ = ["VanillaGradient"] diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py new file mode 100644 index 00000000..0c82af65 --- /dev/null +++ b/src/dalia/gradient_methods/smart_gradient.py @@ -0,0 +1,111 @@ +from dalia.core.gradient_method import GradientMethod +from dalia import xp + + +class SmartGradient(GradientMethod): + """Smart gradient computation method. + + + References + ---------- + .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue. + Smart Gradient - An adaptive technique for improving + gradient estimation. Foundations of Data Science, 2022, + 4(1): 123-136. doi: 10.3934/fods.2021037 + """ + + def __init__(self): + super().__init__() + + # SMART GRADIENT STUFF + # self.fd_step = xp.cbrt(xp.finfo(float).eps) + self.fd_step = 1e-3 + + self.noise_stddev = 7e-8 # To avoid singularity during QR + + self.G = xp.identity(self.model.n_hyperparameters) + self.c_G = xp.identity(self.model.n_hyperparameters) + self.prev_theta = xp.zeros(self.model.n_hyperparameters) + self.curr_theta = xp.zeros(self.model.n_hyperparameters) + self.count = 0 + self.rng = np.random.default_rng() + + def get_evaluation_directions(self): + ... + # theta_mat is called gradient_direction_for_finite_difference in INLA + # Could be gradient_directions_matrix + + def compute_gradient(self): + ... + + + + def _transformed_fun(self, phi): + return self.curr_theta + self.G @ phi + + def _scale(self, x): + mean = xp.mean(x) + std = xp.std(x, ddof=1) + if std < 1e-12: + return x - mean + return (x - mean) / std + + def _update_G(self, current_theta): + self.curr_theta = current_theta + self.c_G = xp.roll(self.c_G, 1, axis=1) + xdiff = current_theta - self.prev_theta + xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters)) + self.c_G[:, 0] = self._scale(xdiff) + self.G = self.c_G + + def _orthogonalize_G(self): + try: + Q, R = xp.linalg.qr(self.G) + self.G = Q + except xp.linalg.LinAlgError: + print("Warning: QR decomposition failed. Resetting G to identity.") + self.G = xp.identity(self.model.n_hyperparameters) + + def _get_original_grad(self, transformed_grad): + return xp.linalg.solve(self.G.T, transformed_grad) + + def _update_gradient_basis(self) -> None: + for i in range(self.model.n_hyperparameters): + self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2] + + + def _construct_f_evaluation_points(self, theta_i: NDArray) -> None: + if self.count > 0: + self._update_G(theta_i) + else: + self.curr_theta = theta_i # Set the starting point + + self._orthogonalize_G() + + self.prev_theta = xp.copy(theta_i) + self.count += 1 + + # Initialize central difference scheme matrix + self.eps_mat[:] = self.fd_step * self.gradient_basis + self.theta_mat[:] = xp.zeros( + (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64 + ) + + self.curr_theta.T + + self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T) + + self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat + self.theta_mat[ + :, self.model.n_hyperparameters + 1 : self.n_f_evaluations + ] -= self.eps_mat + + for i in range(1, self.n_f_evaluations): + self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T + + def _compute_gradient(self): + for i in range(self.model.n_hyperparameters): + self.gradient_f[i] = ( + self.f_values_i[i + 1] + - self.f_values_i[self.model.n_hyperparameters + i + 1] + ) / (2 * self.fd_step) diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py new file mode 100644 index 00000000..986b9978 --- /dev/null +++ b/src/dalia/gradient_methods/vanilla_gradient.py @@ -0,0 +1,22 @@ +from dalia.core.gradient_method import GradientMethod +from dalia import xp + + + +class VanillaGradient(GradientMethod): + """Vanilla finite difference gradient computation method.""" + + def __init__(self, basis_size, finite_difference_epsilon): + super().__init__(basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon) + + def get_evaluation_directions(self, direction_matrix) -> None: + direction_matrix[:, 1 : self.basis.shape[0] + 1] += self.finite_difference_epsilon * self.basis + direction_matrix[:, 1 + self.basis.shape[0]:] -= self.finite_difference_epsilon * self.basis + + print(direction_matrix) + + def compute_gradient(self, gradient) -> None: + for i in range(self.model.n_hyperparameters): + gradient[i] = (self.f_evaluations[2 * i + 1] - self.f_evaluations[2 * i]) / ( + 2.0 * self.h + ) \ No newline at end of file From 7fb2fd5a2cf0ed70605bb3857957503354b31c6a Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 11 Sep 2025 11:40:51 +0200 Subject: [PATCH 3/6] separated gradient_method from core DALIA class --- src/dalia/core/dalia.py | 38 +++++++++++++------ src/dalia/core/gradient_method.py | 8 ++-- .../gradient_methods/vanilla_gradient.py | 37 +++++++++++++----- 3 files changed, 56 insertions(+), 27 deletions(-) diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index 2ac41088..b19a1088 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -5,8 +5,6 @@ from scipy import optimize from tabulate import tabulate -import numpy as np - from dalia import ArrayLike, NDArray, backend_flags, comm_rank, comm_size, sp, xp from dalia.configs.dalia_config import DaliaConfig from dalia.core.model import Model @@ -194,7 +192,9 @@ def __init__( ) # --- Initialize Gradient Method - self.gradient_method = VanillaGradient(basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3) + self.gradient_method = VanillaGradient( + basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3 + ) # --- Set up recurrent variables self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64) @@ -343,7 +343,7 @@ def run(self) -> dict: } synchronize(comm=self.comm_world) toc = time.perf_counter() - print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush = True) + print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush=True) return results def minimize(self) -> optimize.OptimizeResult: @@ -570,12 +570,12 @@ def _objective_function( for i in range(self.n_f_evaluations): task_mapping.append(i % n_feval_comm) - self.gradient_method.get_evaluation_directions(direction_matrix=self.theta_mat) + self.gradient_method.get_evaluation_directions( + direction_matrix=self.theta_mat, theta=theta_i + ) print(self.theta_mat) - exit() - # Proceed to the parallel function evaluation for feval_i in range(self.n_f_evaluations - 1, -1, -1): # Perform the evaluation in reverse order so that the stored and returned @@ -596,10 +596,12 @@ def _objective_function( synchronize(comm=self.comm_world) # Compute gradient using central difference scheme - self._compute_gradient() + self.gradient_method.compute_gradient( + function_evaluations=self.f_values_i, gradient=self.gradient_f + ) f_0 = get_host(self.f_values_i[0]) - grad_f = get_host(self._get_original_grad(self.gradient_f)) + grad_f = get_host(self.gradient_f) synchronize(comm=self.comm_world) toc = time.perf_counter() @@ -616,6 +618,14 @@ def _objective_function( ) self.iter += 1 + print(f"self.f_values_i: {self.f_values_i}") + print(f"self.gradient_f: {self.gradient_f}") + print(f"f_0: {f_0}") + print(f"grad_f: {grad_f}") + + """ if self.iter == 2: + exit() """ + return (f_0, grad_f) def _evaluate_f( @@ -793,11 +803,15 @@ def compute_covariance_hp(self, theta_i: NDArray) -> NDArray: # flush=True, # ) cov_theta = xp.linalg.inv(hess_theta) - synchronize(comm=self.comm_world) + synchronize(comm=self.comm_world) toc = time.perf_counter() t_covariance_hp = toc - tic - print_msg("Time to compute covariance of hyperparameters:", t_covariance_hp, flush=True) - + print_msg( + "Time to compute covariance of hyperparameters:", + t_covariance_hp, + flush=True, + ) + return cov_theta def _evaluate_hessian_f( diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py index f795140e..2deadc18 100644 --- a/src/dalia/core/gradient_method.py +++ b/src/dalia/core/gradient_method.py @@ -7,13 +7,11 @@ class GradientMethod(ABC): """Core class for gradient computation methods.""" def __init__(self, basis_size, finite_difference_epsilon): - self.basis = xp.identity((basis_size, basis_size), dtype=xp.float64) + self.basis = xp.identity(basis_size, dtype=xp.float64) self.finite_difference_epsilon = finite_difference_epsilon @abstractmethod - def get_evaluation_directions(self, direction_matrix) -> None: - ... + def get_evaluation_directions(self, direction_matrix, theta) -> None: ... @abstractmethod - def compute_gradient(self, gradient) -> None: - ... \ No newline at end of file + def compute_gradient(self, function_evaluations, gradient) -> None: ... diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py index 986b9978..a6f66dba 100644 --- a/src/dalia/gradient_methods/vanilla_gradient.py +++ b/src/dalia/gradient_methods/vanilla_gradient.py @@ -1,22 +1,39 @@ from dalia.core.gradient_method import GradientMethod from dalia import xp +from dalia.utils import get_device class VanillaGradient(GradientMethod): """Vanilla finite difference gradient computation method.""" def __init__(self, basis_size, finite_difference_epsilon): - super().__init__(basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon) + super().__init__( + basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon + ) - def get_evaluation_directions(self, direction_matrix) -> None: - direction_matrix[:, 1 : self.basis.shape[0] + 1] += self.finite_difference_epsilon * self.basis - direction_matrix[:, 1 + self.basis.shape[0]:] -= self.finite_difference_epsilon * self.basis + def get_evaluation_directions(self, direction_matrix, theta) -> None: + direction_matrix.fill(0.0) + direction_matrix[:, 0] = get_device(theta) - print(direction_matrix) + print(f"theta: {theta}") - def compute_gradient(self, gradient) -> None: - for i in range(self.model.n_hyperparameters): - gradient[i] = (self.f_evaluations[2 * i + 1] - self.f_evaluations[2 * i]) / ( - 2.0 * self.h - ) \ No newline at end of file + direction_matrix[ + :, 1 : self.basis.shape[0] + 1 + ] += self.finite_difference_epsilon * self.basis + xp.repeat( + get_device(theta).reshape(-1, 1), self.basis.shape[0], 1 + ) + direction_matrix[ + :, 1 + self.basis.shape[0] : + ] -= self.finite_difference_epsilon * self.basis - xp.repeat( + get_device(theta).reshape(-1, 1), self.basis.shape[0], 1 + ) + + def compute_gradient(self, function_evaluations, gradient) -> None: + + print(f"IG: function_evaluations: {function_evaluations}") + for i in range(self.basis.shape[0]): + gradient[i] = ( + function_evaluations[i + 1] + - function_evaluations[self.basis.shape[0] + i + 1] + ) / (2.0 * self.finite_difference_epsilon) From c38327f2021a228cd357edc781901e9246988be6 Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 11 Sep 2025 12:30:00 +0200 Subject: [PATCH 4/6] added config and structure for smart_gradient however smart_gradient needs to be debugged --- src/dalia/configs/dalia_config.py | 17 +- src/dalia/configs/gradient_method_config.py | 11 +- src/dalia/core/dalia.py | 21 +- src/dalia/core/gradient_method.py | 50 +++- src/dalia/gradient_methods/__init__.py | 4 +- src/dalia/gradient_methods/smart_gradient.py | 240 ++++++++++++------ .../gradient_methods/vanilla_gradient.py | 61 ++++- 7 files changed, 297 insertions(+), 107 deletions(-) diff --git a/src/dalia/configs/dalia_config.py b/src/dalia/configs/dalia_config.py index b37dd736..c8d2a015 100644 --- a/src/dalia/configs/dalia_config.py +++ b/src/dalia/configs/dalia_config.py @@ -6,6 +6,12 @@ from pydantic import BaseModel, ConfigDict, PositiveInt +from dalia.configs.gradient_method_config import ( + GradientMethodConfig, + SmartGradientConfig, + VanillaGradientConfig, +) + class SolverConfig(BaseModel): model_config = ConfigDict(extra="forbid") @@ -20,9 +26,11 @@ class BFGSConfig(BaseModel): max_iter: PositiveInt = 100 jac: bool = True - - maxcor: PositiveInt = 10 # maximum number of past gradient vectors to store -> good default: dim(theta) - maxls: PositiveInt = 20 # maximum number of line search iterations + + maxcor: PositiveInt = ( + 10 # maximum number of past gradient vectors to store -> good default: dim(theta) + ) + maxls: PositiveInt = 20 # maximum number of line search iterations gtol: float = 1e-1 # c1: float = 1e-4 # only relevant for BFGS not for L-BFGS-B @@ -36,11 +44,12 @@ class DaliaConfig(BaseModel): # --- Simulation parameters ------------------------------------------------ solver: SolverConfig = SolverConfig() minimize: BFGSConfig = BFGSConfig() + gradient_method: GradientMethodConfig = SmartGradientConfig() # exit BFGS early if the reduction in the objective function is less than f_reduction_tol after f_reduction_lag iterations f_reduction_lag: int = 3 f_reduction_tol: float = 1e-4 - + # exit BFGS early if the change in theta is less than theta_reduction_tol after theta_reduction_lag iterations theta_reduction_lag: int = 3 theta_reduction_tol: float = 1e-4 diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py index 9782e342..12d56bf1 100644 --- a/src/dalia/configs/gradient_method_config.py +++ b/src/dalia/configs/gradient_method_config.py @@ -9,14 +9,19 @@ class GradientMethodConfig(BaseModel, ABC): # Input folder for this specific submodel input_dir: str = None - type: Literal["vanilla_gradient", "smart_gradient"] = "smart_gradient" + type: Literal["vanilla_gradient", "smart_gradient"] = None finite_difference_epsilon: float = 1e-3 -class VanillaGradientConfig(GradientMethodConfig): ... + +class VanillaGradientConfig(GradientMethodConfig): + type: Literal["vanilla_gradient"] = "vanilla_gradient" + class SmartGradientConfig(GradientMethodConfig): + type: Literal["smart_gradient"] = "smart_gradient" + # The diagonal noise ensure to avoid singularities in the QR decomposition diagonal_noise: float = 1e-8 # Threshold below which scaling is not performed - scaling_threashold: float = 1e-12 \ No newline at end of file + scaling_threshold: float = 1e-12 diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index b19a1088..083b5025 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -9,7 +9,7 @@ from dalia.configs.dalia_config import DaliaConfig from dalia.core.model import Model from dalia.solvers import DenseSolver, DistSerinvSolver, SerinvSolver, SparseSolver -from dalia.gradient_methods import VanillaGradient +from dalia.gradient_methods import VanillaGradient, SmartGradient from dalia.utils import ( DummyCommunicator, add_str_header, @@ -192,9 +192,22 @@ def __init__( ) # --- Initialize Gradient Method - self.gradient_method = VanillaGradient( - basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3 - ) + if self.config.gradient_method.type == "vanilla_gradient": + self.gradient_method = VanillaGradient( + basis_size=self.model.n_hyperparameters, + finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon, + ) + elif self.config.gradient_method.type == "smart_gradient": + self.gradient_method = SmartGradient( + basis_size=self.model.n_hyperparameters, + finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon, + diagonal_noise=self.config.gradient_method.diagonal_noise, + scaling_threshold=self.config.gradient_method.scaling_threshold, + ) + else: + raise ValueError( + f"Unknown gradient method type: {self.config.gradient_method.type}" + ) # --- Set up recurrent variables self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64) diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py index 2deadc18..3bf30bf1 100644 --- a/src/dalia/core/gradient_method.py +++ b/src/dalia/core/gradient_method.py @@ -6,12 +6,54 @@ class GradientMethod(ABC): """Core class for gradient computation methods.""" - def __init__(self, basis_size, finite_difference_epsilon): - self.basis = xp.identity(basis_size, dtype=xp.float64) + def __init__(self, basis_size, finite_difference_epsilon) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + + Returns + ------- + None + """ + self.basis_size = basis_size + self.basis = xp.identity(self.basis_size, dtype=xp.float64) self.finite_difference_epsilon = finite_difference_epsilon @abstractmethod - def get_evaluation_directions(self, direction_matrix, theta) -> None: ... + def get_evaluation_directions(self, direction_matrix, theta) -> None: + """Get the evaluation directions for the gradient computation. + + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + ... @abstractmethod - def compute_gradient(self, function_evaluations, gradient) -> None: ... + def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. + + Returns + ------- + None + """ + ... diff --git a/src/dalia/gradient_methods/__init__.py b/src/dalia/gradient_methods/__init__.py index 95487c21..4e807656 100644 --- a/src/dalia/gradient_methods/__init__.py +++ b/src/dalia/gradient_methods/__init__.py @@ -1,6 +1,6 @@ # Copyright 2024-2025 DALIA authors. All rights reserved. from dalia.gradient_methods.vanilla_gradient import VanillaGradient -# from dalia.gradient_methods.smart_gradient import SmartGradient +from dalia.gradient_methods.smart_gradient import SmartGradient -__all__ = ["VanillaGradient"] +__all__ = ["VanillaGradient", "SmartGradient"] diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py index 0c82af65..71d75f7c 100644 --- a/src/dalia/gradient_methods/smart_gradient.py +++ b/src/dalia/gradient_methods/smart_gradient.py @@ -1,111 +1,193 @@ -from dalia.core.gradient_method import GradientMethod +from numpy import random as rand + from dalia import xp +from dalia.core.gradient_method import GradientMethod +from dalia.utils import get_device + class SmartGradient(GradientMethod): """Smart gradient computation method. - - + + This method adaptively updates the basis used for finite difference + gradient estimation based on the changes in the (hyper)parameter values. + It employs QR decomposition to maintain an orthogonal basis and scales + the updates to ensure numerical stability. + References ---------- - .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue. - Smart Gradient - An adaptive technique for improving - gradient estimation. Foundations of Data Science, 2022, + .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue. + Smart Gradient - An adaptive technique for improving + gradient estimation. Foundations of Data Science, 2022, 4(1): 123-136. doi: 10.3934/fods.2021037 """ - def __init__(self): - super().__init__() - - # SMART GRADIENT STUFF - # self.fd_step = xp.cbrt(xp.finfo(float).eps) - self.fd_step = 1e-3 - - self.noise_stddev = 7e-8 # To avoid singularity during QR - - self.G = xp.identity(self.model.n_hyperparameters) - self.c_G = xp.identity(self.model.n_hyperparameters) - self.prev_theta = xp.zeros(self.model.n_hyperparameters) - self.curr_theta = xp.zeros(self.model.n_hyperparameters) + def __init__( + self, + basis_size, + finite_difference_epsilon, + diagonal_noise=1e-8, + scaling_threshold=1e-12, + ) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + diagonal_noise : float + The diagonal noise to avoid singularities in the QR decomposition. + scaling_threshold : float + The threshold below which scaling is not performed. + + Returns + ------- + None + """ + super().__init__( + basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon + ) + self.diagonal_noise = diagonal_noise + self.scaling_threshold = scaling_threshold + self.temp_basis = xp.identity(self.basis_size) + self.prev_theta = xp.zeros(self.basis_size) + self.curr_theta = xp.zeros(self.basis_size) self.count = 0 - self.rng = np.random.default_rng() - - def get_evaluation_directions(self): - ... - # theta_mat is called gradient_direction_for_finite_difference in INLA - # Could be gradient_directions_matrix - - def compute_gradient(self): - ... - - - - def _transformed_fun(self, phi): - return self.curr_theta + self.G @ phi - - def _scale(self, x): + self.rng = rand.default_rng() + + def _transformed_fun(self, phi) -> xp.ndarray: + """Transform the input using the current theta and basis. + + Parameters + ---------- + phi : xp.ndarray + The input to transform. + + Returns + ------- + xp.ndarray + The transformed output. + """ + return self.curr_theta + self.basis @ phi + + def _scale(self, x) -> xp.ndarray: + """Scale the input vector. + + Parameters + ---------- + x : xp.ndarray + The input vector to scale. + + Returns + ------- + xp.ndarray + The scaled output vector. + """ mean = xp.mean(x) std = xp.std(x, ddof=1) - if std < 1e-12: + if std < self.scaling_threshold: return x - mean return (x - mean) / std - def _update_G(self, current_theta): + def _update_basis(self, current_theta) -> None: + """Update the basis using the current theta. + + Parameters + ---------- + current_theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ self.curr_theta = current_theta - self.c_G = xp.roll(self.c_G, 1, axis=1) + self.temp_basis = xp.roll(self.temp_basis, 1, axis=1) xdiff = current_theta - self.prev_theta - xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters)) - self.c_G[:, 0] = self._scale(xdiff) - self.G = self.c_G + xdiff += get_device(self.rng.normal(0.0, self.diagonal_noise, self.basis_size)) + self.temp_basis[:, 0] = self._scale(xdiff) + self.basis = self.temp_basis - def _orthogonalize_G(self): + def _orthogonalize_basis(self) -> None: + """Orthogonalize the basis using QR decomposition. + + Parameters + ---------- + None + + Returns + ------- + None + """ try: - Q, R = xp.linalg.qr(self.G) - self.G = Q + Q, R = xp.linalg.qr(self.basis) + self.basis = Q except xp.linalg.LinAlgError: print("Warning: QR decomposition failed. Resetting G to identity.") - self.G = xp.identity(self.model.n_hyperparameters) + self.basis = xp.identity(self.basis_size) + + def get_evaluation_directions(self, direction_matrix, theta) -> None: + """Get the evaluation directions for the gradient computation. - def _get_original_grad(self, transformed_grad): - return xp.linalg.solve(self.G.T, transformed_grad) + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + theta_dev = get_device(theta) + direction_matrix.fill(0.0) - def _update_gradient_basis(self) -> None: - for i in range(self.model.n_hyperparameters): - self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2] - - - def _construct_f_evaluation_points(self, theta_i: NDArray) -> None: if self.count > 0: - self._update_G(theta_i) + self._update_basis(theta_dev) else: - self.curr_theta = theta_i # Set the starting point + self.curr_theta = theta_dev # Set the starting point - self._orthogonalize_G() + self._orthogonalize_basis() - self.prev_theta = xp.copy(theta_i) + self.prev_theta = xp.copy(theta_dev) self.count += 1 - # Initialize central difference scheme matrix - self.eps_mat[:] = self.fd_step * self.gradient_basis - self.theta_mat[:] = xp.zeros( - (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64 + # First column is the current theta + direction_matrix[:, 0] = theta_dev + # Next basis_size columns are + epsilon * e_i + direction_matrix[:, 1 : 1 + self.basis_size] += ( + self.finite_difference_epsilon * self.basis + ) + # Next basis_size columns are - epsilon * e_i + direction_matrix[:, self.basis_size + 1 :] -= ( + self.finite_difference_epsilon * self.basis ) - self.curr_theta.T - - self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T) - - self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat - self.theta_mat[ - :, self.model.n_hyperparameters + 1 : self.n_f_evaluations - ] -= self.eps_mat - - for i in range(1, self.n_f_evaluations): - self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T - - def _compute_gradient(self): - for i in range(self.model.n_hyperparameters): - self.gradient_f[i] = ( - self.f_values_i[i + 1] - - self.f_values_i[self.model.n_hyperparameters + i + 1] - ) / (2 * self.fd_step) + for i in range(1, direction_matrix.shape[1]): + direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]).T + + def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. + + Returns + ------- + None + """ + for i in range(self.basis_size): + gradient[i] = ( + function_evaluations[i + 1] + - function_evaluations[self.basis_size + i + 1] + ) / (2.0 * self.finite_difference_epsilon) + + # Transform back the gradient into the original basis + gradient[:] = xp.linalg.solve(self.basis.T, gradient) diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py index a6f66dba..20bdabde 100644 --- a/src/dalia/gradient_methods/vanilla_gradient.py +++ b/src/dalia/gradient_methods/vanilla_gradient.py @@ -7,33 +7,72 @@ class VanillaGradient(GradientMethod): """Vanilla finite difference gradient computation method.""" - def __init__(self, basis_size, finite_difference_epsilon): + def __init__(self, basis_size, finite_difference_epsilon) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + + Returns + ------- + None + """ super().__init__( basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon ) def get_evaluation_directions(self, direction_matrix, theta) -> None: - direction_matrix.fill(0.0) - direction_matrix[:, 0] = get_device(theta) + """Get the evaluation directions for the gradient computation. - print(f"theta: {theta}") + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + Returns + ------- + None + """ + theta_dev = get_device(theta) + direction_matrix.fill(0.0) + + # First column is the current theta + direction_matrix[:, 0] = theta_dev + # Next basis_size columns are + epsilon * e_i direction_matrix[ - :, 1 : self.basis.shape[0] + 1 + :, 1 : self.basis_size + 1 ] += self.finite_difference_epsilon * self.basis + xp.repeat( - get_device(theta).reshape(-1, 1), self.basis.shape[0], 1 + theta_dev.reshape(-1, 1), self.basis_size, 1 ) + # Next basis_size columns are - epsilon * e_i direction_matrix[ - :, 1 + self.basis.shape[0] : + :, 1 + self.basis_size : ] -= self.finite_difference_epsilon * self.basis - xp.repeat( - get_device(theta).reshape(-1, 1), self.basis.shape[0], 1 + theta_dev.reshape(-1, 1), self.basis_size, 1 ) def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. - print(f"IG: function_evaluations: {function_evaluations}") - for i in range(self.basis.shape[0]): + Returns + ------- + None + """ + for i in range(self.basis_size): gradient[i] = ( function_evaluations[i + 1] - - function_evaluations[self.basis.shape[0] + i + 1] + - function_evaluations[self.basis_size + i + 1] ) / (2.0 * self.finite_difference_epsilon) From eaabad6f94dc6d2ff6dc25700bd25b9f1712cb5c Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 11 Sep 2025 15:14:01 +0200 Subject: [PATCH 5/6] smart_gradient is working --- src/dalia/core/dalia.py | 10 ---------- src/dalia/gradient_methods/smart_gradient.py | 20 +++++++++++--------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index 083b5025..cc4a6182 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -587,8 +587,6 @@ def _objective_function( direction_matrix=self.theta_mat, theta=theta_i ) - print(self.theta_mat) - # Proceed to the parallel function evaluation for feval_i in range(self.n_f_evaluations - 1, -1, -1): # Perform the evaluation in reverse order so that the stored and returned @@ -631,14 +629,6 @@ def _objective_function( ) self.iter += 1 - print(f"self.f_values_i: {self.f_values_i}") - print(f"self.gradient_f: {self.gradient_f}") - print(f"f_0: {f_0}") - print(f"grad_f: {grad_f}") - - """ if self.iter == 2: - exit() """ - return (f_0, grad_f) def _evaluate_f( diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py index 71d75f7c..495db6f0 100644 --- a/src/dalia/gradient_methods/smart_gradient.py +++ b/src/dalia/gradient_methods/smart_gradient.py @@ -70,7 +70,7 @@ def _transformed_fun(self, phi) -> xp.ndarray: xp.ndarray The transformed output. """ - return self.curr_theta + self.basis @ phi + return (self.curr_theta + self.basis @ phi).T def _scale(self, x) -> xp.ndarray: """Scale the input vector. @@ -125,7 +125,9 @@ def _orthogonalize_basis(self) -> None: Q, R = xp.linalg.qr(self.basis) self.basis = Q except xp.linalg.LinAlgError: - print("Warning: QR decomposition failed. Resetting G to identity.") + print( + "Warning: QR decomposition failed. Resetting G to identity.", flush=True + ) self.basis = xp.identity(self.basis_size) def get_evaluation_directions(self, direction_matrix, theta) -> None: @@ -158,16 +160,16 @@ def get_evaluation_directions(self, direction_matrix, theta) -> None: # First column is the current theta direction_matrix[:, 0] = theta_dev # Next basis_size columns are + epsilon * e_i - direction_matrix[:, 1 : 1 + self.basis_size] += ( - self.finite_difference_epsilon * self.basis - ) + direction_matrix[ + :, 1 : 1 + self.basis_size + ] += self.finite_difference_epsilon * xp.eye(self.basis_size) # Next basis_size columns are - epsilon * e_i - direction_matrix[:, self.basis_size + 1 :] -= ( - self.finite_difference_epsilon * self.basis - ) + direction_matrix[ + :, self.basis_size + 1 : + ] -= self.finite_difference_epsilon * xp.eye(self.basis_size) for i in range(1, direction_matrix.shape[1]): - direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]).T + direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]) def compute_gradient(self, function_evaluations, gradient) -> None: """Compute the gradient using finite differences. From fb45be483aa5d6aaead3012c0b67513af9c5134a Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 11 Sep 2025 17:28:17 +0200 Subject: [PATCH 6/6] Simplification of smart gradient method --- src/dalia/configs/gradient_method_config.py | 1 - src/dalia/gradient_methods/smart_gradient.py | 26 +++++--------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py index 12d56bf1..a02443cb 100644 --- a/src/dalia/configs/gradient_method_config.py +++ b/src/dalia/configs/gradient_method_config.py @@ -8,7 +8,6 @@ class GradientMethodConfig(BaseModel, ABC): model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) # Input folder for this specific submodel - input_dir: str = None type: Literal["vanilla_gradient", "smart_gradient"] = None finite_difference_epsilon: float = 1e-3 diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py index 495db6f0..71845da6 100644 --- a/src/dalia/gradient_methods/smart_gradient.py +++ b/src/dalia/gradient_methods/smart_gradient.py @@ -57,21 +57,6 @@ def __init__( self.count = 0 self.rng = rand.default_rng() - def _transformed_fun(self, phi) -> xp.ndarray: - """Transform the input using the current theta and basis. - - Parameters - ---------- - phi : xp.ndarray - The input to transform. - - Returns - ------- - xp.ndarray - The transformed output. - """ - return (self.curr_theta + self.basis @ phi).T - def _scale(self, x) -> xp.ndarray: """Scale the input vector. @@ -162,14 +147,15 @@ def get_evaluation_directions(self, direction_matrix, theta) -> None: # Next basis_size columns are + epsilon * e_i direction_matrix[ :, 1 : 1 + self.basis_size - ] += self.finite_difference_epsilon * xp.eye(self.basis_size) + ] += self.finite_difference_epsilon * self.basis + xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) # Next basis_size columns are - epsilon * e_i direction_matrix[ :, self.basis_size + 1 : - ] -= self.finite_difference_epsilon * xp.eye(self.basis_size) - - for i in range(1, direction_matrix.shape[1]): - direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]) + ] -= self.finite_difference_epsilon * self.basis - xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) def compute_gradient(self, function_evaluations, gradient) -> None: """Compute the gradient using finite differences.