diff --git a/src/dalia/configs/dalia_config.py b/src/dalia/configs/dalia_config.py index 0dbbbec0..c8d2a015 100644 --- a/src/dalia/configs/dalia_config.py +++ b/src/dalia/configs/dalia_config.py @@ -6,6 +6,12 @@ from pydantic import BaseModel, ConfigDict, PositiveInt +from dalia.configs.gradient_method_config import ( + GradientMethodConfig, + SmartGradientConfig, + VanillaGradientConfig, +) + class SolverConfig(BaseModel): model_config = ConfigDict(extra="forbid") @@ -20,15 +26,16 @@ class BFGSConfig(BaseModel): max_iter: PositiveInt = 100 jac: bool = True - - maxcor: PositiveInt = 10 # maximum number of past gradient vectors to store -> good default: dim(theta) - maxls: PositiveInt = 20 # maximum number of line search iterations + + maxcor: PositiveInt = ( + 10 # maximum number of past gradient vectors to store -> good default: dim(theta) + ) + maxls: PositiveInt = 20 # maximum number of line search iterations gtol: float = 1e-1 # c1: float = 1e-4 # only relevant for BFGS not for L-BFGS-B # c2: float = 0.9 # only relevant for BFGS not for L-BFGS-B disp: bool = False - class DaliaConfig(BaseModel): @@ -37,11 +44,12 @@ class DaliaConfig(BaseModel): # --- Simulation parameters ------------------------------------------------ solver: SolverConfig = SolverConfig() minimize: BFGSConfig = BFGSConfig() + gradient_method: GradientMethodConfig = SmartGradientConfig() # exit BFGS early if the reduction in the objective function is less than f_reduction_tol after f_reduction_lag iterations f_reduction_lag: int = 3 f_reduction_tol: float = 1e-4 - + # exit BFGS early if the change in theta is less than theta_reduction_tol after theta_reduction_lag iterations theta_reduction_lag: int = 3 theta_reduction_tol: float = 1e-4 diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py new file mode 100644 index 00000000..a02443cb --- /dev/null +++ b/src/dalia/configs/gradient_method_config.py @@ -0,0 +1,26 @@ +from abc import ABC +from typing import Literal + +from pydantic import BaseModel, ConfigDict + + +class GradientMethodConfig(BaseModel, ABC): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + # Input folder for this specific submodel + type: Literal["vanilla_gradient", "smart_gradient"] = None + + finite_difference_epsilon: float = 1e-3 + + +class VanillaGradientConfig(GradientMethodConfig): + type: Literal["vanilla_gradient"] = "vanilla_gradient" + + +class SmartGradientConfig(GradientMethodConfig): + type: Literal["smart_gradient"] = "smart_gradient" + + # The diagonal noise ensure to avoid singularities in the QR decomposition + diagonal_noise: float = 1e-8 + # Threshold below which scaling is not performed + scaling_threshold: float = 1e-12 diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py index 5756d086..cc4a6182 100644 --- a/src/dalia/core/dalia.py +++ b/src/dalia/core/dalia.py @@ -9,6 +9,7 @@ from dalia.configs.dalia_config import DaliaConfig from dalia.core.model import Model from dalia.solvers import DenseSolver, DistSerinvSolver, SerinvSolver, SparseSolver +from dalia.gradient_methods import VanillaGradient, SmartGradient from dalia.utils import ( DummyCommunicator, add_str_header, @@ -190,9 +191,28 @@ def __init__( nccl_comm=self.nccl_comm, ) + # --- Initialize Gradient Method + if self.config.gradient_method.type == "vanilla_gradient": + self.gradient_method = VanillaGradient( + basis_size=self.model.n_hyperparameters, + finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon, + ) + elif self.config.gradient_method.type == "smart_gradient": + self.gradient_method = SmartGradient( + basis_size=self.model.n_hyperparameters, + finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon, + diagonal_noise=self.config.gradient_method.diagonal_noise, + scaling_threshold=self.config.gradient_method.scaling_threshold, + ) + else: + raise ValueError( + f"Unknown gradient method type: {self.config.gradient_method.type}" + ) + # --- Set up recurrent variables self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64) self.f_values_i = xp.zeros(self.n_f_evaluations, dtype=xp.float64) + self.gradient_basis = xp.eye(self.model.n_hyperparameters, dtype=xp.float64) self.eps_mat = xp.zeros( (self.model.n_hyperparameters, self.model.n_hyperparameters), dtype=xp.float64, @@ -336,7 +356,7 @@ def run(self) -> dict: } synchronize(comm=self.comm_world) toc = time.perf_counter() - print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush = True) + print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush=True) return results def minimize(self) -> optimize.OptimizeResult: @@ -563,15 +583,9 @@ def _objective_function( for i in range(self.n_f_evaluations): task_mapping.append(i % n_feval_comm) - # Initialize central difference scheme matrix - self.eps_mat[:] = self.eps_gradient_f * xp.eye(self.model.n_hyperparameters) - self.theta_mat[:] = xp.repeat( - get_device(theta_i).reshape(-1, 1), self.n_f_evaluations, axis=1 + self.gradient_method.get_evaluation_directions( + direction_matrix=self.theta_mat, theta=theta_i ) - self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat - self.theta_mat[ - :, self.model.n_hyperparameters + 1 : self.n_f_evaluations - ] -= self.eps_mat # Proceed to the parallel function evaluation for feval_i in range(self.n_f_evaluations - 1, -1, -1): @@ -593,11 +607,9 @@ def _objective_function( synchronize(comm=self.comm_world) # Compute gradient using central difference scheme - for i in range(self.model.n_hyperparameters): - self.gradient_f[i] = ( - self.f_values_i[i + 1] - - self.f_values_i[self.model.n_hyperparameters + i + 1] - ) / (2 * self.eps_gradient_f) + self.gradient_method.compute_gradient( + function_evaluations=self.f_values_i, gradient=self.gradient_f + ) f_0 = get_host(self.f_values_i[0]) grad_f = get_host(self.gradient_f) @@ -794,11 +806,15 @@ def compute_covariance_hp(self, theta_i: NDArray) -> NDArray: # flush=True, # ) cov_theta = xp.linalg.inv(hess_theta) - synchronize(comm=self.comm_world) + synchronize(comm=self.comm_world) toc = time.perf_counter() t_covariance_hp = toc - tic - print_msg("Time to compute covariance of hyperparameters:", t_covariance_hp, flush=True) - + print_msg( + "Time to compute covariance of hyperparameters:", + t_covariance_hp, + flush=True, + ) + return cov_theta def _evaluate_hessian_f( diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py new file mode 100644 index 00000000..3bf30bf1 --- /dev/null +++ b/src/dalia/core/gradient_method.py @@ -0,0 +1,59 @@ +from abc import ABC, abstractmethod + +from dalia import xp + + +class GradientMethod(ABC): + """Core class for gradient computation methods.""" + + def __init__(self, basis_size, finite_difference_epsilon) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + + Returns + ------- + None + """ + self.basis_size = basis_size + self.basis = xp.identity(self.basis_size, dtype=xp.float64) + self.finite_difference_epsilon = finite_difference_epsilon + + @abstractmethod + def get_evaluation_directions(self, direction_matrix, theta) -> None: + """Get the evaluation directions for the gradient computation. + + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + ... + + @abstractmethod + def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. + + Returns + ------- + None + """ + ... diff --git a/src/dalia/gradient_methods/__init__.py b/src/dalia/gradient_methods/__init__.py new file mode 100644 index 00000000..4e807656 --- /dev/null +++ b/src/dalia/gradient_methods/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2024-2025 DALIA authors. All rights reserved. + +from dalia.gradient_methods.vanilla_gradient import VanillaGradient +from dalia.gradient_methods.smart_gradient import SmartGradient + +__all__ = ["VanillaGradient", "SmartGradient"] diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py new file mode 100644 index 00000000..71845da6 --- /dev/null +++ b/src/dalia/gradient_methods/smart_gradient.py @@ -0,0 +1,181 @@ +from numpy import random as rand + +from dalia import xp + +from dalia.core.gradient_method import GradientMethod +from dalia.utils import get_device + + +class SmartGradient(GradientMethod): + """Smart gradient computation method. + + This method adaptively updates the basis used for finite difference + gradient estimation based on the changes in the (hyper)parameter values. + It employs QR decomposition to maintain an orthogonal basis and scales + the updates to ensure numerical stability. + + References + ---------- + .. [1] Esmail Abdul Fattah, Janet Van Niekerk, HÃ¥vard Rue. + Smart Gradient - An adaptive technique for improving + gradient estimation. Foundations of Data Science, 2022, + 4(1): 123-136. doi: 10.3934/fods.2021037 + """ + + def __init__( + self, + basis_size, + finite_difference_epsilon, + diagonal_noise=1e-8, + scaling_threshold=1e-12, + ) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + diagonal_noise : float + The diagonal noise to avoid singularities in the QR decomposition. + scaling_threshold : float + The threshold below which scaling is not performed. + + Returns + ------- + None + """ + super().__init__( + basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon + ) + self.diagonal_noise = diagonal_noise + self.scaling_threshold = scaling_threshold + self.temp_basis = xp.identity(self.basis_size) + self.prev_theta = xp.zeros(self.basis_size) + self.curr_theta = xp.zeros(self.basis_size) + self.count = 0 + self.rng = rand.default_rng() + + def _scale(self, x) -> xp.ndarray: + """Scale the input vector. + + Parameters + ---------- + x : xp.ndarray + The input vector to scale. + + Returns + ------- + xp.ndarray + The scaled output vector. + """ + mean = xp.mean(x) + std = xp.std(x, ddof=1) + if std < self.scaling_threshold: + return x - mean + return (x - mean) / std + + def _update_basis(self, current_theta) -> None: + """Update the basis using the current theta. + + Parameters + ---------- + current_theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + self.curr_theta = current_theta + self.temp_basis = xp.roll(self.temp_basis, 1, axis=1) + xdiff = current_theta - self.prev_theta + xdiff += get_device(self.rng.normal(0.0, self.diagonal_noise, self.basis_size)) + self.temp_basis[:, 0] = self._scale(xdiff) + self.basis = self.temp_basis + + def _orthogonalize_basis(self) -> None: + """Orthogonalize the basis using QR decomposition. + + Parameters + ---------- + None + + Returns + ------- + None + """ + try: + Q, R = xp.linalg.qr(self.basis) + self.basis = Q + except xp.linalg.LinAlgError: + print( + "Warning: QR decomposition failed. Resetting G to identity.", flush=True + ) + self.basis = xp.identity(self.basis_size) + + def get_evaluation_directions(self, direction_matrix, theta) -> None: + """Get the evaluation directions for the gradient computation. + + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + theta_dev = get_device(theta) + direction_matrix.fill(0.0) + + if self.count > 0: + self._update_basis(theta_dev) + else: + self.curr_theta = theta_dev # Set the starting point + + self._orthogonalize_basis() + + self.prev_theta = xp.copy(theta_dev) + self.count += 1 + + # First column is the current theta + direction_matrix[:, 0] = theta_dev + # Next basis_size columns are + epsilon * e_i + direction_matrix[ + :, 1 : 1 + self.basis_size + ] += self.finite_difference_epsilon * self.basis + xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) + # Next basis_size columns are - epsilon * e_i + direction_matrix[ + :, self.basis_size + 1 : + ] -= self.finite_difference_epsilon * self.basis - xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) + + def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. + + Returns + ------- + None + """ + for i in range(self.basis_size): + gradient[i] = ( + function_evaluations[i + 1] + - function_evaluations[self.basis_size + i + 1] + ) / (2.0 * self.finite_difference_epsilon) + + # Transform back the gradient into the original basis + gradient[:] = xp.linalg.solve(self.basis.T, gradient) diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py new file mode 100644 index 00000000..20bdabde --- /dev/null +++ b/src/dalia/gradient_methods/vanilla_gradient.py @@ -0,0 +1,78 @@ +from dalia.core.gradient_method import GradientMethod +from dalia import xp + +from dalia.utils import get_device + + +class VanillaGradient(GradientMethod): + """Vanilla finite difference gradient computation method.""" + + def __init__(self, basis_size, finite_difference_epsilon) -> None: + """Initialize the gradient computation method. + + Parameters + ---------- + basis_size : int + The size of the basis for finite differences. + finite_difference_epsilon : float + The epsilon value for finite difference computations. + + Returns + ------- + None + """ + super().__init__( + basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon + ) + + def get_evaluation_directions(self, direction_matrix, theta) -> None: + """Get the evaluation directions for the gradient computation. + + Parameters + ---------- + direction_matrix : xp.ndarray + The matrix to store the evaluation directions. + theta : xp.ndarray + The current (hyper)parameter values. + + Returns + ------- + None + """ + theta_dev = get_device(theta) + direction_matrix.fill(0.0) + + # First column is the current theta + direction_matrix[:, 0] = theta_dev + # Next basis_size columns are + epsilon * e_i + direction_matrix[ + :, 1 : self.basis_size + 1 + ] += self.finite_difference_epsilon * self.basis + xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) + # Next basis_size columns are - epsilon * e_i + direction_matrix[ + :, 1 + self.basis_size : + ] -= self.finite_difference_epsilon * self.basis - xp.repeat( + theta_dev.reshape(-1, 1), self.basis_size, 1 + ) + + def compute_gradient(self, function_evaluations, gradient) -> None: + """Compute the gradient using finite differences. + + Parameters + ---------- + function_evaluations : xp.ndarray + The function evaluations at the current and perturbed points. + gradient : xp.ndarray + The array to store the computed gradient. + + Returns + ------- + None + """ + for i in range(self.basis_size): + gradient[i] = ( + function_evaluations[i + 1] + - function_evaluations[self.basis_size + i + 1] + ) / (2.0 * self.finite_difference_epsilon)