From 3336dad0a0e50e191a6aef1854bf72297718a3bd Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Tue, 9 Sep 2025 17:04:31 +0200
Subject: [PATCH 1/6] Initial implementation of SmartGradient Co-authored-by:
 esmail-abdulfattah <esmail.abdulfattah@kaust.edu.sa>

---
 src/dalia/core/dalia.py | 108 ++++++++++++++++++++++++++++++++++------
 1 file changed, 93 insertions(+), 15 deletions(-)

diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py
index 5756d086..6b00303c 100644
--- a/src/dalia/core/dalia.py
+++ b/src/dalia/core/dalia.py
@@ -5,6 +5,8 @@
 from scipy import optimize
 from tabulate import tabulate
 
+import numpy as np
+
 from dalia import ArrayLike, NDArray, backend_flags, comm_rank, comm_size, sp, xp
 from dalia.configs.dalia_config import DaliaConfig
 from dalia.core.model import Model
@@ -193,6 +195,7 @@ def __init__(
         # --- Set up recurrent variables
         self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64)
         self.f_values_i = xp.zeros(self.n_f_evaluations, dtype=xp.float64)
+        self.gradient_basis = xp.eye(self.model.n_hyperparameters, dtype=xp.float64)
         self.eps_mat = xp.zeros(
             (self.model.n_hyperparameters, self.model.n_hyperparameters),
             dtype=xp.float64,
@@ -220,6 +223,22 @@ def __init__(
         logging.info("DALIA initialized.")
         print_msg("DALIA initialized.", flush=True)
 
+
+
+
+        # SMART GRADIENT STUFF
+        # self.fd_step = xp.cbrt(xp.finfo(float).eps)
+        self.fd_step = 1e-3
+       
+        self.noise_stddev = 7e-8 # To avoid singularity during QR
+       
+        self.G = xp.identity(self.model.n_hyperparameters)
+        self.c_G = xp.identity(self.model.n_hyperparameters)
+        self.prev_theta = xp.zeros(self.model.n_hyperparameters)
+        self.curr_theta = xp.zeros(self.model.n_hyperparameters)
+        self.count = 0
+        self.rng = np.random.default_rng()
+
     def _print_init(self) -> None:
         """
         Print informations about the DALIA solver.
@@ -529,6 +548,75 @@ def callback(intermediate_result: optimize.OptimizeResult):
 
         return self.minimization_result
 
+    def _transformed_fun(self, phi):
+        return self.curr_theta + self.G @ phi
+
+    def _scale(self, x):
+        mean = xp.mean(x)
+        std = xp.std(x, ddof=1)
+        if std < 1e-12:
+            return x - mean
+        return (x - mean) / std
+
+    def _update_G(self, current_theta):
+        self.curr_theta = current_theta
+        self.c_G = xp.roll(self.c_G, 1, axis=1)
+        xdiff = current_theta - self.prev_theta
+        xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters))
+        self.c_G[:, 0] = self._scale(xdiff)
+        self.G = self.c_G
+
+    def _orthogonalize_G(self):
+        try:
+            Q, R = xp.linalg.qr(self.G)
+            self.G = Q
+        except xp.linalg.LinAlgError:
+            print("Warning: QR decomposition failed. Resetting G to identity.")
+            self.G = xp.identity(self.model.n_hyperparameters)
+
+    def _get_original_grad(self, transformed_grad):
+        return xp.linalg.solve(self.G.T, transformed_grad)
+
+    def _update_gradient_basis(self) -> None:
+        for i in range(self.model.n_hyperparameters):
+            self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2]
+        
+    def _construct_f_evaluation_points(self, theta_i: NDArray) -> None:
+        if self.count > 0:
+            self._update_G(theta_i)
+        else:
+            self.curr_theta = theta_i  # Set the starting point
+
+        self._orthogonalize_G()
+
+        self.prev_theta = xp.copy(theta_i)
+        self.count += 1
+
+        # Initialize central difference scheme matrix
+        self.eps_mat[:] = self.fd_step * self.gradient_basis
+        self.theta_mat[:] = xp.zeros(
+            (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64
+        )
+
+        self.curr_theta.T
+
+        self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T)
+
+        self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat
+        self.theta_mat[
+            :, self.model.n_hyperparameters + 1 : self.n_f_evaluations
+        ] -= self.eps_mat
+
+        for i in range(1, self.n_f_evaluations):
+            self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T
+
+    def _compute_gradient(self):
+        for i in range(self.model.n_hyperparameters):
+            self.gradient_f[i] = (
+                self.f_values_i[i + 1]
+                - self.f_values_i[self.model.n_hyperparameters + i + 1]
+            ) / (2 * self.fd_step)
+
     def _objective_function(
         self,
         theta_i: NDArray,
@@ -563,15 +651,9 @@ def _objective_function(
         for i in range(self.n_f_evaluations):
             task_mapping.append(i % n_feval_comm)
 
-        # Initialize central difference scheme matrix
-        self.eps_mat[:] = self.eps_gradient_f * xp.eye(self.model.n_hyperparameters)
-        self.theta_mat[:] = xp.repeat(
-            get_device(theta_i).reshape(-1, 1), self.n_f_evaluations, axis=1
-        )
-        self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat
-        self.theta_mat[
-            :, self.model.n_hyperparameters + 1 : self.n_f_evaluations
-        ] -= self.eps_mat
+        self._construct_f_evaluation_points(theta_i=get_device(theta_i))
+
+        
 
         # Proceed to the parallel function evaluation
         for feval_i in range(self.n_f_evaluations - 1, -1, -1):
@@ -593,14 +675,10 @@ def _objective_function(
         synchronize(comm=self.comm_world)
 
         # Compute gradient using central difference scheme
-        for i in range(self.model.n_hyperparameters):
-            self.gradient_f[i] = (
-                self.f_values_i[i + 1]
-                - self.f_values_i[self.model.n_hyperparameters + i + 1]
-            ) / (2 * self.eps_gradient_f)
+        self._compute_gradient()
 
         f_0 = get_host(self.f_values_i[0])
-        grad_f = get_host(self.gradient_f)
+        grad_f = get_host(self._get_original_grad(self.gradient_f))
 
         synchronize(comm=self.comm_world)
         toc = time.perf_counter()

From 537df2ecb6a81bd4b7597d6790600cd70e48cc36 Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Wed, 10 Sep 2025 11:40:13 +0200
Subject: [PATCH 2/6] draft of clean implementation of GradientMethod

---
 src/dalia/configs/dalia_config.py             |   1 -
 src/dalia/configs/gradient_method_config.py   |  22 ++++
 src/dalia/core/dalia.py                       |  95 ++-------------
 src/dalia/core/gradient_method.py             |  19 +++
 src/dalia/gradient_methods/__init__.py        |   6 +
 src/dalia/gradient_methods/smart_gradient.py  | 111 ++++++++++++++++++
 .../gradient_methods/vanilla_gradient.py      |  22 ++++
 7 files changed, 188 insertions(+), 88 deletions(-)
 create mode 100644 src/dalia/configs/gradient_method_config.py
 create mode 100644 src/dalia/core/gradient_method.py
 create mode 100644 src/dalia/gradient_methods/__init__.py
 create mode 100644 src/dalia/gradient_methods/smart_gradient.py
 create mode 100644 src/dalia/gradient_methods/vanilla_gradient.py

diff --git a/src/dalia/configs/dalia_config.py b/src/dalia/configs/dalia_config.py
index 0dbbbec0..b37dd736 100644
--- a/src/dalia/configs/dalia_config.py
+++ b/src/dalia/configs/dalia_config.py
@@ -28,7 +28,6 @@ class BFGSConfig(BaseModel):
     # c1: float = 1e-4  # only relevant for BFGS not for L-BFGS-B
     # c2: float = 0.9  # only relevant for BFGS not for L-BFGS-B
     disp: bool = False
-    
 
 
 class DaliaConfig(BaseModel):
diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py
new file mode 100644
index 00000000..9782e342
--- /dev/null
+++ b/src/dalia/configs/gradient_method_config.py
@@ -0,0 +1,22 @@
+from abc import ABC
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class GradientMethodConfig(BaseModel, ABC):
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+
+    # Input folder for this specific submodel
+    input_dir: str = None
+    type: Literal["vanilla_gradient", "smart_gradient"] = "smart_gradient"
+
+    finite_difference_epsilon: float = 1e-3
+
+class VanillaGradientConfig(GradientMethodConfig): ...
+
+class SmartGradientConfig(GradientMethodConfig):
+    # The diagonal noise ensure to avoid singularities in the QR decomposition
+    diagonal_noise: float = 1e-8
+    # Threshold below which scaling is not performed
+    scaling_threashold: float = 1e-12
\ No newline at end of file
diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py
index 6b00303c..2ac41088 100644
--- a/src/dalia/core/dalia.py
+++ b/src/dalia/core/dalia.py
@@ -11,6 +11,7 @@
 from dalia.configs.dalia_config import DaliaConfig
 from dalia.core.model import Model
 from dalia.solvers import DenseSolver, DistSerinvSolver, SerinvSolver, SparseSolver
+from dalia.gradient_methods import VanillaGradient
 from dalia.utils import (
     DummyCommunicator,
     add_str_header,
@@ -192,6 +193,9 @@ def __init__(
                     nccl_comm=self.nccl_comm,
                 )
 
+        # --- Initialize Gradient Method
+        self.gradient_method = VanillaGradient(basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3)
+
         # --- Set up recurrent variables
         self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64)
         self.f_values_i = xp.zeros(self.n_f_evaluations, dtype=xp.float64)
@@ -223,22 +227,6 @@ def __init__(
         logging.info("DALIA initialized.")
         print_msg("DALIA initialized.", flush=True)
 
-
-
-
-        # SMART GRADIENT STUFF
-        # self.fd_step = xp.cbrt(xp.finfo(float).eps)
-        self.fd_step = 1e-3
-       
-        self.noise_stddev = 7e-8 # To avoid singularity during QR
-       
-        self.G = xp.identity(self.model.n_hyperparameters)
-        self.c_G = xp.identity(self.model.n_hyperparameters)
-        self.prev_theta = xp.zeros(self.model.n_hyperparameters)
-        self.curr_theta = xp.zeros(self.model.n_hyperparameters)
-        self.count = 0
-        self.rng = np.random.default_rng()
-
     def _print_init(self) -> None:
         """
         Print informations about the DALIA solver.
@@ -548,75 +536,6 @@ def callback(intermediate_result: optimize.OptimizeResult):
 
         return self.minimization_result
 
-    def _transformed_fun(self, phi):
-        return self.curr_theta + self.G @ phi
-
-    def _scale(self, x):
-        mean = xp.mean(x)
-        std = xp.std(x, ddof=1)
-        if std < 1e-12:
-            return x - mean
-        return (x - mean) / std
-
-    def _update_G(self, current_theta):
-        self.curr_theta = current_theta
-        self.c_G = xp.roll(self.c_G, 1, axis=1)
-        xdiff = current_theta - self.prev_theta
-        xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters))
-        self.c_G[:, 0] = self._scale(xdiff)
-        self.G = self.c_G
-
-    def _orthogonalize_G(self):
-        try:
-            Q, R = xp.linalg.qr(self.G)
-            self.G = Q
-        except xp.linalg.LinAlgError:
-            print("Warning: QR decomposition failed. Resetting G to identity.")
-            self.G = xp.identity(self.model.n_hyperparameters)
-
-    def _get_original_grad(self, transformed_grad):
-        return xp.linalg.solve(self.G.T, transformed_grad)
-
-    def _update_gradient_basis(self) -> None:
-        for i in range(self.model.n_hyperparameters):
-            self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2]
-        
-    def _construct_f_evaluation_points(self, theta_i: NDArray) -> None:
-        if self.count > 0:
-            self._update_G(theta_i)
-        else:
-            self.curr_theta = theta_i  # Set the starting point
-
-        self._orthogonalize_G()
-
-        self.prev_theta = xp.copy(theta_i)
-        self.count += 1
-
-        # Initialize central difference scheme matrix
-        self.eps_mat[:] = self.fd_step * self.gradient_basis
-        self.theta_mat[:] = xp.zeros(
-            (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64
-        )
-
-        self.curr_theta.T
-
-        self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T)
-
-        self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat
-        self.theta_mat[
-            :, self.model.n_hyperparameters + 1 : self.n_f_evaluations
-        ] -= self.eps_mat
-
-        for i in range(1, self.n_f_evaluations):
-            self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T
-
-    def _compute_gradient(self):
-        for i in range(self.model.n_hyperparameters):
-            self.gradient_f[i] = (
-                self.f_values_i[i + 1]
-                - self.f_values_i[self.model.n_hyperparameters + i + 1]
-            ) / (2 * self.fd_step)
-
     def _objective_function(
         self,
         theta_i: NDArray,
@@ -651,9 +570,11 @@ def _objective_function(
         for i in range(self.n_f_evaluations):
             task_mapping.append(i % n_feval_comm)
 
-        self._construct_f_evaluation_points(theta_i=get_device(theta_i))
+        self.gradient_method.get_evaluation_directions(direction_matrix=self.theta_mat)
 
-        
+        print(self.theta_mat)
+
+        exit()
 
         # Proceed to the parallel function evaluation
         for feval_i in range(self.n_f_evaluations - 1, -1, -1):
diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py
new file mode 100644
index 00000000..f795140e
--- /dev/null
+++ b/src/dalia/core/gradient_method.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+from dalia import xp
+
+
+class GradientMethod(ABC):
+    """Core class for gradient computation methods."""
+
+    def __init__(self, basis_size, finite_difference_epsilon):
+        self.basis = xp.identity((basis_size, basis_size), dtype=xp.float64)
+        self.finite_difference_epsilon = finite_difference_epsilon
+
+    @abstractmethod
+    def get_evaluation_directions(self, direction_matrix) -> None:
+        ...
+
+    @abstractmethod
+    def compute_gradient(self, gradient) -> None:
+        ...
\ No newline at end of file
diff --git a/src/dalia/gradient_methods/__init__.py b/src/dalia/gradient_methods/__init__.py
new file mode 100644
index 00000000..95487c21
--- /dev/null
+++ b/src/dalia/gradient_methods/__init__.py
@@ -0,0 +1,6 @@
+# Copyright 2024-2025 DALIA authors. All rights reserved.
+
+from dalia.gradient_methods.vanilla_gradient import VanillaGradient
+# from dalia.gradient_methods.smart_gradient import SmartGradient
+
+__all__ = ["VanillaGradient"]
diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py
new file mode 100644
index 00000000..0c82af65
--- /dev/null
+++ b/src/dalia/gradient_methods/smart_gradient.py
@@ -0,0 +1,111 @@
+from dalia.core.gradient_method import GradientMethod
+from dalia import xp
+
+
+class SmartGradient(GradientMethod):
+    """Smart gradient computation method.
+    
+    
+    References
+    ----------
+    .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue. 
+        Smart Gradient - An adaptive technique for improving 
+        gradient estimation. Foundations of Data Science, 2022, 
+        4(1): 123-136. doi: 10.3934/fods.2021037
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        # SMART GRADIENT STUFF
+        # self.fd_step = xp.cbrt(xp.finfo(float).eps)
+        self.fd_step = 1e-3
+       
+        self.noise_stddev = 7e-8 # To avoid singularity during QR
+       
+        self.G = xp.identity(self.model.n_hyperparameters)
+        self.c_G = xp.identity(self.model.n_hyperparameters)
+        self.prev_theta = xp.zeros(self.model.n_hyperparameters)
+        self.curr_theta = xp.zeros(self.model.n_hyperparameters)
+        self.count = 0
+        self.rng = np.random.default_rng()
+
+    def get_evaluation_directions(self):
+        ...
+        # theta_mat is called gradient_direction_for_finite_difference in INLA
+        # Could be gradient_directions_matrix
+
+    def compute_gradient(self):
+        ...
+
+
+
+    def _transformed_fun(self, phi):
+        return self.curr_theta + self.G @ phi
+
+    def _scale(self, x):
+        mean = xp.mean(x)
+        std = xp.std(x, ddof=1)
+        if std < 1e-12:
+            return x - mean
+        return (x - mean) / std
+
+    def _update_G(self, current_theta):
+        self.curr_theta = current_theta
+        self.c_G = xp.roll(self.c_G, 1, axis=1)
+        xdiff = current_theta - self.prev_theta
+        xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters))
+        self.c_G[:, 0] = self._scale(xdiff)
+        self.G = self.c_G
+
+    def _orthogonalize_G(self):
+        try:
+            Q, R = xp.linalg.qr(self.G)
+            self.G = Q
+        except xp.linalg.LinAlgError:
+            print("Warning: QR decomposition failed. Resetting G to identity.")
+            self.G = xp.identity(self.model.n_hyperparameters)
+
+    def _get_original_grad(self, transformed_grad):
+        return xp.linalg.solve(self.G.T, transformed_grad)
+
+    def _update_gradient_basis(self) -> None:
+        for i in range(self.model.n_hyperparameters):
+            self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2]
+        
+    
+    def _construct_f_evaluation_points(self, theta_i: NDArray) -> None:
+        if self.count > 0:
+            self._update_G(theta_i)
+        else:
+            self.curr_theta = theta_i  # Set the starting point
+
+        self._orthogonalize_G()
+
+        self.prev_theta = xp.copy(theta_i)
+        self.count += 1
+
+        # Initialize central difference scheme matrix
+        self.eps_mat[:] = self.fd_step * self.gradient_basis
+        self.theta_mat[:] = xp.zeros(
+            (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64
+        )
+
+        self.curr_theta.T
+
+        self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T)
+
+        self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat
+        self.theta_mat[
+            :, self.model.n_hyperparameters + 1 : self.n_f_evaluations
+        ] -= self.eps_mat
+
+        for i in range(1, self.n_f_evaluations):
+            self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T
+
+    def _compute_gradient(self):
+        for i in range(self.model.n_hyperparameters):
+            self.gradient_f[i] = (
+                self.f_values_i[i + 1]
+                - self.f_values_i[self.model.n_hyperparameters + i + 1]
+            ) / (2 * self.fd_step)
diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py
new file mode 100644
index 00000000..986b9978
--- /dev/null
+++ b/src/dalia/gradient_methods/vanilla_gradient.py
@@ -0,0 +1,22 @@
+from dalia.core.gradient_method import GradientMethod
+from dalia import xp
+
+
+
+class VanillaGradient(GradientMethod):
+    """Vanilla finite difference gradient computation method."""
+
+    def __init__(self, basis_size, finite_difference_epsilon):
+        super().__init__(basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon)
+
+    def get_evaluation_directions(self, direction_matrix) -> None:
+        direction_matrix[:, 1 : self.basis.shape[0] + 1] += self.finite_difference_epsilon * self.basis
+        direction_matrix[:, 1 + self.basis.shape[0]:] -= self.finite_difference_epsilon * self.basis
+
+        print(direction_matrix)
+
+    def compute_gradient(self, gradient) -> None:
+        for i in range(self.model.n_hyperparameters):
+            gradient[i] = (self.f_evaluations[2 * i + 1] - self.f_evaluations[2 * i]) / (
+                2.0 * self.h
+            )
\ No newline at end of file

From 7fb2fd5a2cf0ed70605bb3857957503354b31c6a Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Thu, 11 Sep 2025 11:40:51 +0200
Subject: [PATCH 3/6] separated gradient_method from core DALIA class

---
 src/dalia/core/dalia.py                       | 38 +++++++++++++------
 src/dalia/core/gradient_method.py             |  8 ++--
 .../gradient_methods/vanilla_gradient.py      | 37 +++++++++++++-----
 3 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py
index 2ac41088..b19a1088 100644
--- a/src/dalia/core/dalia.py
+++ b/src/dalia/core/dalia.py
@@ -5,8 +5,6 @@
 from scipy import optimize
 from tabulate import tabulate
 
-import numpy as np
-
 from dalia import ArrayLike, NDArray, backend_flags, comm_rank, comm_size, sp, xp
 from dalia.configs.dalia_config import DaliaConfig
 from dalia.core.model import Model
@@ -194,7 +192,9 @@ def __init__(
                 )
 
         # --- Initialize Gradient Method
-        self.gradient_method = VanillaGradient(basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3)
+        self.gradient_method = VanillaGradient(
+            basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3
+        )
 
         # --- Set up recurrent variables
         self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64)
@@ -343,7 +343,7 @@ def run(self) -> dict:
         }
         synchronize(comm=self.comm_world)
         toc = time.perf_counter()
-        print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush = True)
+        print_msg(f"DALIA inference took: {toc - tic:0.4f} (s)", flush=True)
         return results
 
     def minimize(self) -> optimize.OptimizeResult:
@@ -570,12 +570,12 @@ def _objective_function(
         for i in range(self.n_f_evaluations):
             task_mapping.append(i % n_feval_comm)
 
-        self.gradient_method.get_evaluation_directions(direction_matrix=self.theta_mat)
+        self.gradient_method.get_evaluation_directions(
+            direction_matrix=self.theta_mat, theta=theta_i
+        )
 
         print(self.theta_mat)
 
-        exit()
-
         # Proceed to the parallel function evaluation
         for feval_i in range(self.n_f_evaluations - 1, -1, -1):
             # Perform the evaluation in reverse order so that the stored and returned
@@ -596,10 +596,12 @@ def _objective_function(
         synchronize(comm=self.comm_world)
 
         # Compute gradient using central difference scheme
-        self._compute_gradient()
+        self.gradient_method.compute_gradient(
+            function_evaluations=self.f_values_i, gradient=self.gradient_f
+        )
 
         f_0 = get_host(self.f_values_i[0])
-        grad_f = get_host(self._get_original_grad(self.gradient_f))
+        grad_f = get_host(self.gradient_f)
 
         synchronize(comm=self.comm_world)
         toc = time.perf_counter()
@@ -616,6 +618,14 @@ def _objective_function(
             )
         self.iter += 1
 
+        print(f"self.f_values_i: {self.f_values_i}")
+        print(f"self.gradient_f: {self.gradient_f}")
+        print(f"f_0: {f_0}")
+        print(f"grad_f: {grad_f}")
+
+        """ if self.iter == 2:
+            exit() """
+
         return (f_0, grad_f)
 
     def _evaluate_f(
@@ -793,11 +803,15 @@ def compute_covariance_hp(self, theta_i: NDArray) -> NDArray:
         #     flush=True,
         # )
         cov_theta = xp.linalg.inv(hess_theta)
-        synchronize(comm=self.comm_world)     
+        synchronize(comm=self.comm_world)
         toc = time.perf_counter()
         t_covariance_hp = toc - tic
-        print_msg("Time to compute covariance of hyperparameters:", t_covariance_hp, flush=True)
-        
+        print_msg(
+            "Time to compute covariance of hyperparameters:",
+            t_covariance_hp,
+            flush=True,
+        )
+
         return cov_theta
 
     def _evaluate_hessian_f(
diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py
index f795140e..2deadc18 100644
--- a/src/dalia/core/gradient_method.py
+++ b/src/dalia/core/gradient_method.py
@@ -7,13 +7,11 @@ class GradientMethod(ABC):
     """Core class for gradient computation methods."""
 
     def __init__(self, basis_size, finite_difference_epsilon):
-        self.basis = xp.identity((basis_size, basis_size), dtype=xp.float64)
+        self.basis = xp.identity(basis_size, dtype=xp.float64)
         self.finite_difference_epsilon = finite_difference_epsilon
 
     @abstractmethod
-    def get_evaluation_directions(self, direction_matrix) -> None:
-        ...
+    def get_evaluation_directions(self, direction_matrix, theta) -> None: ...
 
     @abstractmethod
-    def compute_gradient(self, gradient) -> None:
-        ...
\ No newline at end of file
+    def compute_gradient(self, function_evaluations, gradient) -> None: ...
diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py
index 986b9978..a6f66dba 100644
--- a/src/dalia/gradient_methods/vanilla_gradient.py
+++ b/src/dalia/gradient_methods/vanilla_gradient.py
@@ -1,22 +1,39 @@
 from dalia.core.gradient_method import GradientMethod
 from dalia import xp
 
+from dalia.utils import get_device
 
 
 class VanillaGradient(GradientMethod):
     """Vanilla finite difference gradient computation method."""
 
     def __init__(self, basis_size, finite_difference_epsilon):
-        super().__init__(basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon)
+        super().__init__(
+            basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon
+        )
 
-    def get_evaluation_directions(self, direction_matrix) -> None:
-        direction_matrix[:, 1 : self.basis.shape[0] + 1] += self.finite_difference_epsilon * self.basis
-        direction_matrix[:, 1 + self.basis.shape[0]:] -= self.finite_difference_epsilon * self.basis
+    def get_evaluation_directions(self, direction_matrix, theta) -> None:
+        direction_matrix.fill(0.0)
+        direction_matrix[:, 0] = get_device(theta)
 
-        print(direction_matrix)
+        print(f"theta: {theta}")
 
-    def compute_gradient(self, gradient) -> None:
-        for i in range(self.model.n_hyperparameters):
-            gradient[i] = (self.f_evaluations[2 * i + 1] - self.f_evaluations[2 * i]) / (
-                2.0 * self.h
-            )
\ No newline at end of file
+        direction_matrix[
+            :, 1 : self.basis.shape[0] + 1
+        ] += self.finite_difference_epsilon * self.basis + xp.repeat(
+            get_device(theta).reshape(-1, 1), self.basis.shape[0], 1
+        )
+        direction_matrix[
+            :, 1 + self.basis.shape[0] :
+        ] -= self.finite_difference_epsilon * self.basis - xp.repeat(
+            get_device(theta).reshape(-1, 1), self.basis.shape[0], 1
+        )
+
+    def compute_gradient(self, function_evaluations, gradient) -> None:
+
+        print(f"IG: function_evaluations: {function_evaluations}")
+        for i in range(self.basis.shape[0]):
+            gradient[i] = (
+                function_evaluations[i + 1]
+                - function_evaluations[self.basis.shape[0] + i + 1]
+            ) / (2.0 * self.finite_difference_epsilon)

From c38327f2021a228cd357edc781901e9246988be6 Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Thu, 11 Sep 2025 12:30:00 +0200
Subject: [PATCH 4/6] added config and structure for smart_gradient however
 smart_gradient needs to be debugged

---
 src/dalia/configs/dalia_config.py             |  17 +-
 src/dalia/configs/gradient_method_config.py   |  11 +-
 src/dalia/core/dalia.py                       |  21 +-
 src/dalia/core/gradient_method.py             |  50 +++-
 src/dalia/gradient_methods/__init__.py        |   4 +-
 src/dalia/gradient_methods/smart_gradient.py  | 240 ++++++++++++------
 .../gradient_methods/vanilla_gradient.py      |  61 ++++-
 7 files changed, 297 insertions(+), 107 deletions(-)

diff --git a/src/dalia/configs/dalia_config.py b/src/dalia/configs/dalia_config.py
index b37dd736..c8d2a015 100644
--- a/src/dalia/configs/dalia_config.py
+++ b/src/dalia/configs/dalia_config.py
@@ -6,6 +6,12 @@
 
 from pydantic import BaseModel, ConfigDict, PositiveInt
 
+from dalia.configs.gradient_method_config import (
+    GradientMethodConfig,
+    SmartGradientConfig,
+    VanillaGradientConfig,
+)
+
 
 class SolverConfig(BaseModel):
     model_config = ConfigDict(extra="forbid")
@@ -20,9 +26,11 @@ class BFGSConfig(BaseModel):
 
     max_iter: PositiveInt = 100
     jac: bool = True
-    
-    maxcor: PositiveInt = 10 # maximum number of past gradient vectors to store -> good default: dim(theta)
-    maxls: PositiveInt = 20 # maximum number of line search iterations
+
+    maxcor: PositiveInt = (
+        10  # maximum number of past gradient vectors to store -> good default: dim(theta)
+    )
+    maxls: PositiveInt = 20  # maximum number of line search iterations
 
     gtol: float = 1e-1
     # c1: float = 1e-4  # only relevant for BFGS not for L-BFGS-B
@@ -36,11 +44,12 @@ class DaliaConfig(BaseModel):
     # --- Simulation parameters ------------------------------------------------
     solver: SolverConfig = SolverConfig()
     minimize: BFGSConfig = BFGSConfig()
+    gradient_method: GradientMethodConfig = SmartGradientConfig()
 
     # exit BFGS early if the reduction in the objective function is less than f_reduction_tol after f_reduction_lag iterations
     f_reduction_lag: int = 3
     f_reduction_tol: float = 1e-4
-    
+
     # exit BFGS early if the change in theta is less than theta_reduction_tol after theta_reduction_lag iterations
     theta_reduction_lag: int = 3
     theta_reduction_tol: float = 1e-4
diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py
index 9782e342..12d56bf1 100644
--- a/src/dalia/configs/gradient_method_config.py
+++ b/src/dalia/configs/gradient_method_config.py
@@ -9,14 +9,19 @@ class GradientMethodConfig(BaseModel, ABC):
 
     # Input folder for this specific submodel
     input_dir: str = None
-    type: Literal["vanilla_gradient", "smart_gradient"] = "smart_gradient"
+    type: Literal["vanilla_gradient", "smart_gradient"] = None
 
     finite_difference_epsilon: float = 1e-3
 
-class VanillaGradientConfig(GradientMethodConfig): ...
+
+class VanillaGradientConfig(GradientMethodConfig):
+    type: Literal["vanilla_gradient"] = "vanilla_gradient"
+
 
 class SmartGradientConfig(GradientMethodConfig):
+    type: Literal["smart_gradient"] = "smart_gradient"
+
     # The diagonal noise ensure to avoid singularities in the QR decomposition
     diagonal_noise: float = 1e-8
     # Threshold below which scaling is not performed
-    scaling_threashold: float = 1e-12
\ No newline at end of file
+    scaling_threshold: float = 1e-12
diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py
index b19a1088..083b5025 100644
--- a/src/dalia/core/dalia.py
+++ b/src/dalia/core/dalia.py
@@ -9,7 +9,7 @@
 from dalia.configs.dalia_config import DaliaConfig
 from dalia.core.model import Model
 from dalia.solvers import DenseSolver, DistSerinvSolver, SerinvSolver, SparseSolver
-from dalia.gradient_methods import VanillaGradient
+from dalia.gradient_methods import VanillaGradient, SmartGradient
 from dalia.utils import (
     DummyCommunicator,
     add_str_header,
@@ -192,9 +192,22 @@ def __init__(
                 )
 
         # --- Initialize Gradient Method
-        self.gradient_method = VanillaGradient(
-            basis_size=self.model.n_hyperparameters, finite_difference_epsilon=1e-3
-        )
+        if self.config.gradient_method.type == "vanilla_gradient":
+            self.gradient_method = VanillaGradient(
+                basis_size=self.model.n_hyperparameters,
+                finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon,
+            )
+        elif self.config.gradient_method.type == "smart_gradient":
+            self.gradient_method = SmartGradient(
+                basis_size=self.model.n_hyperparameters,
+                finite_difference_epsilon=self.config.gradient_method.finite_difference_epsilon,
+                diagonal_noise=self.config.gradient_method.diagonal_noise,
+                scaling_threshold=self.config.gradient_method.scaling_threshold,
+            )
+        else:
+            raise ValueError(
+                f"Unknown gradient method type: {self.config.gradient_method.type}"
+            )
 
         # --- Set up recurrent variables
         self.gradient_f = xp.zeros(self.model.n_hyperparameters, dtype=xp.float64)
diff --git a/src/dalia/core/gradient_method.py b/src/dalia/core/gradient_method.py
index 2deadc18..3bf30bf1 100644
--- a/src/dalia/core/gradient_method.py
+++ b/src/dalia/core/gradient_method.py
@@ -6,12 +6,54 @@
 class GradientMethod(ABC):
     """Core class for gradient computation methods."""
 
-    def __init__(self, basis_size, finite_difference_epsilon):
-        self.basis = xp.identity(basis_size, dtype=xp.float64)
+    def __init__(self, basis_size, finite_difference_epsilon) -> None:
+        """Initialize the gradient computation method.
+
+        Parameters
+        ----------
+        basis_size : int
+            The size of the basis for finite differences.
+        finite_difference_epsilon : float
+            The epsilon value for finite difference computations.
+
+        Returns
+        -------
+        None
+        """
+        self.basis_size = basis_size
+        self.basis = xp.identity(self.basis_size, dtype=xp.float64)
         self.finite_difference_epsilon = finite_difference_epsilon
 
     @abstractmethod
-    def get_evaluation_directions(self, direction_matrix, theta) -> None: ...
+    def get_evaluation_directions(self, direction_matrix, theta) -> None:
+        """Get the evaluation directions for the gradient computation.
+
+        Parameters
+        ----------
+        direction_matrix : xp.ndarray
+            The matrix to store the evaluation directions.
+        theta : xp.ndarray
+            The current (hyper)parameter values.
+
+        Returns
+        -------
+        None
+        """
+        ...
 
     @abstractmethod
-    def compute_gradient(self, function_evaluations, gradient) -> None: ...
+    def compute_gradient(self, function_evaluations, gradient) -> None:
+        """Compute the gradient using finite differences.
+
+        Parameters
+        ----------
+        function_evaluations : xp.ndarray
+            The function evaluations at the current and perturbed points.
+        gradient : xp.ndarray
+            The array to store the computed gradient.
+
+        Returns
+        -------
+        None
+        """
+        ...
diff --git a/src/dalia/gradient_methods/__init__.py b/src/dalia/gradient_methods/__init__.py
index 95487c21..4e807656 100644
--- a/src/dalia/gradient_methods/__init__.py
+++ b/src/dalia/gradient_methods/__init__.py
@@ -1,6 +1,6 @@
 # Copyright 2024-2025 DALIA authors. All rights reserved.
 
 from dalia.gradient_methods.vanilla_gradient import VanillaGradient
-# from dalia.gradient_methods.smart_gradient import SmartGradient
+from dalia.gradient_methods.smart_gradient import SmartGradient
 
-__all__ = ["VanillaGradient"]
+__all__ = ["VanillaGradient", "SmartGradient"]
diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py
index 0c82af65..71d75f7c 100644
--- a/src/dalia/gradient_methods/smart_gradient.py
+++ b/src/dalia/gradient_methods/smart_gradient.py
@@ -1,111 +1,193 @@
-from dalia.core.gradient_method import GradientMethod
+from numpy import random as rand
+
 from dalia import xp
 
+from dalia.core.gradient_method import GradientMethod
+from dalia.utils import get_device
+
 
 class SmartGradient(GradientMethod):
     """Smart gradient computation method.
-    
-    
+
+    This method adaptively updates the basis used for finite difference
+    gradient estimation based on the changes in the (hyper)parameter values.
+    It employs QR decomposition to maintain an orthogonal basis and scales
+    the updates to ensure numerical stability.
+
     References
     ----------
-    .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue. 
-        Smart Gradient - An adaptive technique for improving 
-        gradient estimation. Foundations of Data Science, 2022, 
+    .. [1] Esmail Abdul Fattah, Janet Van Niekerk, Håvard Rue.
+        Smart Gradient - An adaptive technique for improving
+        gradient estimation. Foundations of Data Science, 2022,
         4(1): 123-136. doi: 10.3934/fods.2021037
     """
 
-    def __init__(self):
-        super().__init__()
-
-        # SMART GRADIENT STUFF
-        # self.fd_step = xp.cbrt(xp.finfo(float).eps)
-        self.fd_step = 1e-3
-       
-        self.noise_stddev = 7e-8 # To avoid singularity during QR
-       
-        self.G = xp.identity(self.model.n_hyperparameters)
-        self.c_G = xp.identity(self.model.n_hyperparameters)
-        self.prev_theta = xp.zeros(self.model.n_hyperparameters)
-        self.curr_theta = xp.zeros(self.model.n_hyperparameters)
+    def __init__(
+        self,
+        basis_size,
+        finite_difference_epsilon,
+        diagonal_noise=1e-8,
+        scaling_threshold=1e-12,
+    ) -> None:
+        """Initialize the gradient computation method.
+
+        Parameters
+        ----------
+        basis_size : int
+            The size of the basis for finite differences.
+        finite_difference_epsilon : float
+            The epsilon value for finite difference computations.
+        diagonal_noise : float
+            The diagonal noise to avoid singularities in the QR decomposition.
+        scaling_threshold : float
+            The threshold below which scaling is not performed.
+
+        Returns
+        -------
+        None
+        """
+        super().__init__(
+            basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon
+        )
+        self.diagonal_noise = diagonal_noise
+        self.scaling_threshold = scaling_threshold
+        self.temp_basis = xp.identity(self.basis_size)
+        self.prev_theta = xp.zeros(self.basis_size)
+        self.curr_theta = xp.zeros(self.basis_size)
         self.count = 0
-        self.rng = np.random.default_rng()
-
-    def get_evaluation_directions(self):
-        ...
-        # theta_mat is called gradient_direction_for_finite_difference in INLA
-        # Could be gradient_directions_matrix
-
-    def compute_gradient(self):
-        ...
-
-
-
-    def _transformed_fun(self, phi):
-        return self.curr_theta + self.G @ phi
-
-    def _scale(self, x):
+        self.rng = rand.default_rng()
+
+    def _transformed_fun(self, phi) -> xp.ndarray:
+        """Transform the input using the current theta and basis.
+
+        Parameters
+        ----------
+        phi : xp.ndarray
+            The input to transform.
+
+        Returns
+        -------
+        xp.ndarray
+            The transformed output.
+        """
+        return self.curr_theta + self.basis @ phi
+
+    def _scale(self, x) -> xp.ndarray:
+        """Scale the input vector.
+
+        Parameters
+        ----------
+        x : xp.ndarray
+            The input vector to scale.
+
+        Returns
+        -------
+        xp.ndarray
+            The scaled output vector.
+        """
         mean = xp.mean(x)
         std = xp.std(x, ddof=1)
-        if std < 1e-12:
+        if std < self.scaling_threshold:
             return x - mean
         return (x - mean) / std
 
-    def _update_G(self, current_theta):
+    def _update_basis(self, current_theta) -> None:
+        """Update the basis using the current theta.
+
+        Parameters
+        ----------
+        current_theta : xp.ndarray
+            The current (hyper)parameter values.
+
+        Returns
+        -------
+        None
+        """
         self.curr_theta = current_theta
-        self.c_G = xp.roll(self.c_G, 1, axis=1)
+        self.temp_basis = xp.roll(self.temp_basis, 1, axis=1)
         xdiff = current_theta - self.prev_theta
-        xdiff += get_device(self.rng.normal(0.0, self.noise_stddev, self.model.n_hyperparameters))
-        self.c_G[:, 0] = self._scale(xdiff)
-        self.G = self.c_G
+        xdiff += get_device(self.rng.normal(0.0, self.diagonal_noise, self.basis_size))
+        self.temp_basis[:, 0] = self._scale(xdiff)
+        self.basis = self.temp_basis
 
-    def _orthogonalize_G(self):
+    def _orthogonalize_basis(self) -> None:
+        """Orthogonalize the basis using QR decomposition.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
         try:
-            Q, R = xp.linalg.qr(self.G)
-            self.G = Q
+            Q, R = xp.linalg.qr(self.basis)
+            self.basis = Q
         except xp.linalg.LinAlgError:
             print("Warning: QR decomposition failed. Resetting G to identity.")
-            self.G = xp.identity(self.model.n_hyperparameters)
+            self.basis = xp.identity(self.basis_size)
+
+    def get_evaluation_directions(self, direction_matrix, theta) -> None:
+        """Get the evaluation directions for the gradient computation.
 
-    def _get_original_grad(self, transformed_grad):
-        return xp.linalg.solve(self.G.T, transformed_grad)
+        Parameters
+        ----------
+        direction_matrix : xp.ndarray
+            The matrix to store the evaluation directions.
+        theta : xp.ndarray
+            The current (hyper)parameter values.
+
+        Returns
+        -------
+        None
+        """
+        theta_dev = get_device(theta)
+        direction_matrix.fill(0.0)
 
-    def _update_gradient_basis(self) -> None:
-        for i in range(self.model.n_hyperparameters):
-            self.gradient_basis[:, self.model.n_hyperparameters - i - 1] = self.gradient_basis[:, self.model.n_hyperparameters - i - 2]
-        
-    
-    def _construct_f_evaluation_points(self, theta_i: NDArray) -> None:
         if self.count > 0:
-            self._update_G(theta_i)
+            self._update_basis(theta_dev)
         else:
-            self.curr_theta = theta_i  # Set the starting point
+            self.curr_theta = theta_dev  # Set the starting point
 
-        self._orthogonalize_G()
+        self._orthogonalize_basis()
 
-        self.prev_theta = xp.copy(theta_i)
+        self.prev_theta = xp.copy(theta_dev)
         self.count += 1
 
-        # Initialize central difference scheme matrix
-        self.eps_mat[:] = self.fd_step * self.gradient_basis
-        self.theta_mat[:] = xp.zeros(
-            (self.model.theta.size, self.n_f_evaluations), dtype=xp.float64
+        # First column is the current theta
+        direction_matrix[:, 0] = theta_dev
+        # Next basis_size columns are + epsilon * e_i
+        direction_matrix[:, 1 : 1 + self.basis_size] += (
+            self.finite_difference_epsilon * self.basis
+        )
+        # Next basis_size columns are - epsilon * e_i
+        direction_matrix[:, self.basis_size + 1 :] -= (
+            self.finite_difference_epsilon * self.basis
         )
 
-        self.curr_theta.T
-
-        self.theta_mat[:, 0] = xp.asarray(self.curr_theta.T)
-
-        self.theta_mat[:, 1 : 1 + self.model.n_hyperparameters] += self.eps_mat
-        self.theta_mat[
-            :, self.model.n_hyperparameters + 1 : self.n_f_evaluations
-        ] -= self.eps_mat
-
-        for i in range(1, self.n_f_evaluations):
-            self.theta_mat[:, i] = self._transformed_fun(phi=self.theta_mat[:, i]).T
-
-    def _compute_gradient(self):
-        for i in range(self.model.n_hyperparameters):
-            self.gradient_f[i] = (
-                self.f_values_i[i + 1]
-                - self.f_values_i[self.model.n_hyperparameters + i + 1]
-            ) / (2 * self.fd_step)
+        for i in range(1, direction_matrix.shape[1]):
+            direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]).T
+
+    def compute_gradient(self, function_evaluations, gradient) -> None:
+        """Compute the gradient using finite differences.
+
+        Parameters
+        ----------
+        function_evaluations : xp.ndarray
+            The function evaluations at the current and perturbed points.
+        gradient : xp.ndarray
+            The array to store the computed gradient.
+
+        Returns
+        -------
+        None
+        """
+        for i in range(self.basis_size):
+            gradient[i] = (
+                function_evaluations[i + 1]
+                - function_evaluations[self.basis_size + i + 1]
+            ) / (2.0 * self.finite_difference_epsilon)
+
+        # Transform back the gradient into the original basis
+        gradient[:] = xp.linalg.solve(self.basis.T, gradient)
diff --git a/src/dalia/gradient_methods/vanilla_gradient.py b/src/dalia/gradient_methods/vanilla_gradient.py
index a6f66dba..20bdabde 100644
--- a/src/dalia/gradient_methods/vanilla_gradient.py
+++ b/src/dalia/gradient_methods/vanilla_gradient.py
@@ -7,33 +7,72 @@
 class VanillaGradient(GradientMethod):
     """Vanilla finite difference gradient computation method."""
 
-    def __init__(self, basis_size, finite_difference_epsilon):
+    def __init__(self, basis_size, finite_difference_epsilon) -> None:
+        """Initialize the gradient computation method.
+
+        Parameters
+        ----------
+        basis_size : int
+            The size of the basis for finite differences.
+        finite_difference_epsilon : float
+            The epsilon value for finite difference computations.
+
+        Returns
+        -------
+        None
+        """
         super().__init__(
             basis_size=basis_size, finite_difference_epsilon=finite_difference_epsilon
         )
 
     def get_evaluation_directions(self, direction_matrix, theta) -> None:
-        direction_matrix.fill(0.0)
-        direction_matrix[:, 0] = get_device(theta)
+        """Get the evaluation directions for the gradient computation.
 
-        print(f"theta: {theta}")
+        Parameters
+        ----------
+        direction_matrix : xp.ndarray
+            The matrix to store the evaluation directions.
+        theta : xp.ndarray
+            The current (hyper)parameter values.
 
+        Returns
+        -------
+        None
+        """
+        theta_dev = get_device(theta)
+        direction_matrix.fill(0.0)
+
+        # First column is the current theta
+        direction_matrix[:, 0] = theta_dev
+        # Next basis_size columns are + epsilon * e_i
         direction_matrix[
-            :, 1 : self.basis.shape[0] + 1
+            :, 1 : self.basis_size + 1
         ] += self.finite_difference_epsilon * self.basis + xp.repeat(
-            get_device(theta).reshape(-1, 1), self.basis.shape[0], 1
+            theta_dev.reshape(-1, 1), self.basis_size, 1
         )
+        # Next basis_size columns are - epsilon * e_i
         direction_matrix[
-            :, 1 + self.basis.shape[0] :
+            :, 1 + self.basis_size :
         ] -= self.finite_difference_epsilon * self.basis - xp.repeat(
-            get_device(theta).reshape(-1, 1), self.basis.shape[0], 1
+            theta_dev.reshape(-1, 1), self.basis_size, 1
         )
 
     def compute_gradient(self, function_evaluations, gradient) -> None:
+        """Compute the gradient using finite differences.
+
+        Parameters
+        ----------
+        function_evaluations : xp.ndarray
+            The function evaluations at the current and perturbed points.
+        gradient : xp.ndarray
+            The array to store the computed gradient.
 
-        print(f"IG: function_evaluations: {function_evaluations}")
-        for i in range(self.basis.shape[0]):
+        Returns
+        -------
+        None
+        """
+        for i in range(self.basis_size):
             gradient[i] = (
                 function_evaluations[i + 1]
-                - function_evaluations[self.basis.shape[0] + i + 1]
+                - function_evaluations[self.basis_size + i + 1]
             ) / (2.0 * self.finite_difference_epsilon)

From eaabad6f94dc6d2ff6dc25700bd25b9f1712cb5c Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Thu, 11 Sep 2025 15:14:01 +0200
Subject: [PATCH 5/6] smart_gradient is working

---
 src/dalia/core/dalia.py                      | 10 ----------
 src/dalia/gradient_methods/smart_gradient.py | 20 +++++++++++---------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/dalia/core/dalia.py b/src/dalia/core/dalia.py
index 083b5025..cc4a6182 100644
--- a/src/dalia/core/dalia.py
+++ b/src/dalia/core/dalia.py
@@ -587,8 +587,6 @@ def _objective_function(
             direction_matrix=self.theta_mat, theta=theta_i
         )
 
-        print(self.theta_mat)
-
         # Proceed to the parallel function evaluation
         for feval_i in range(self.n_f_evaluations - 1, -1, -1):
             # Perform the evaluation in reverse order so that the stored and returned
@@ -631,14 +629,6 @@ def _objective_function(
             )
         self.iter += 1
 
-        print(f"self.f_values_i: {self.f_values_i}")
-        print(f"self.gradient_f: {self.gradient_f}")
-        print(f"f_0: {f_0}")
-        print(f"grad_f: {grad_f}")
-
-        """ if self.iter == 2:
-            exit() """
-
         return (f_0, grad_f)
 
     def _evaluate_f(
diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py
index 71d75f7c..495db6f0 100644
--- a/src/dalia/gradient_methods/smart_gradient.py
+++ b/src/dalia/gradient_methods/smart_gradient.py
@@ -70,7 +70,7 @@ def _transformed_fun(self, phi) -> xp.ndarray:
         xp.ndarray
             The transformed output.
         """
-        return self.curr_theta + self.basis @ phi
+        return (self.curr_theta + self.basis @ phi).T
 
     def _scale(self, x) -> xp.ndarray:
         """Scale the input vector.
@@ -125,7 +125,9 @@ def _orthogonalize_basis(self) -> None:
             Q, R = xp.linalg.qr(self.basis)
             self.basis = Q
         except xp.linalg.LinAlgError:
-            print("Warning: QR decomposition failed. Resetting G to identity.")
+            print(
+                "Warning: QR decomposition failed. Resetting G to identity.", flush=True
+            )
             self.basis = xp.identity(self.basis_size)
 
     def get_evaluation_directions(self, direction_matrix, theta) -> None:
@@ -158,16 +160,16 @@ def get_evaluation_directions(self, direction_matrix, theta) -> None:
         # First column is the current theta
         direction_matrix[:, 0] = theta_dev
         # Next basis_size columns are + epsilon * e_i
-        direction_matrix[:, 1 : 1 + self.basis_size] += (
-            self.finite_difference_epsilon * self.basis
-        )
+        direction_matrix[
+            :, 1 : 1 + self.basis_size
+        ] += self.finite_difference_epsilon * xp.eye(self.basis_size)
         # Next basis_size columns are - epsilon * e_i
-        direction_matrix[:, self.basis_size + 1 :] -= (
-            self.finite_difference_epsilon * self.basis
-        )
+        direction_matrix[
+            :, self.basis_size + 1 :
+        ] -= self.finite_difference_epsilon * xp.eye(self.basis_size)
 
         for i in range(1, direction_matrix.shape[1]):
-            direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i]).T
+            direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i])
 
     def compute_gradient(self, function_evaluations, gradient) -> None:
         """Compute the gradient using finite differences.

From fb45be483aa5d6aaead3012c0b67513af9c5134a Mon Sep 17 00:00:00 2001
From: vincent-maillou <vmaillou@iis.ee.ethz.ch>
Date: Thu, 11 Sep 2025 17:28:17 +0200
Subject: [PATCH 6/6] Simplification of smart gradient method

---
 src/dalia/configs/gradient_method_config.py  |  1 -
 src/dalia/gradient_methods/smart_gradient.py | 26 +++++---------------
 2 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/src/dalia/configs/gradient_method_config.py b/src/dalia/configs/gradient_method_config.py
index 12d56bf1..a02443cb 100644
--- a/src/dalia/configs/gradient_method_config.py
+++ b/src/dalia/configs/gradient_method_config.py
@@ -8,7 +8,6 @@ class GradientMethodConfig(BaseModel, ABC):
     model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
 
     # Input folder for this specific submodel
-    input_dir: str = None
     type: Literal["vanilla_gradient", "smart_gradient"] = None
 
     finite_difference_epsilon: float = 1e-3
diff --git a/src/dalia/gradient_methods/smart_gradient.py b/src/dalia/gradient_methods/smart_gradient.py
index 495db6f0..71845da6 100644
--- a/src/dalia/gradient_methods/smart_gradient.py
+++ b/src/dalia/gradient_methods/smart_gradient.py
@@ -57,21 +57,6 @@ def __init__(
         self.count = 0
         self.rng = rand.default_rng()
 
-    def _transformed_fun(self, phi) -> xp.ndarray:
-        """Transform the input using the current theta and basis.
-
-        Parameters
-        ----------
-        phi : xp.ndarray
-            The input to transform.
-
-        Returns
-        -------
-        xp.ndarray
-            The transformed output.
-        """
-        return (self.curr_theta + self.basis @ phi).T
-
     def _scale(self, x) -> xp.ndarray:
         """Scale the input vector.
 
@@ -162,14 +147,15 @@ def get_evaluation_directions(self, direction_matrix, theta) -> None:
         # Next basis_size columns are + epsilon * e_i
         direction_matrix[
             :, 1 : 1 + self.basis_size
-        ] += self.finite_difference_epsilon * xp.eye(self.basis_size)
+        ] += self.finite_difference_epsilon * self.basis + xp.repeat(
+            theta_dev.reshape(-1, 1), self.basis_size, 1
+        )
         # Next basis_size columns are - epsilon * e_i
         direction_matrix[
             :, self.basis_size + 1 :
-        ] -= self.finite_difference_epsilon * xp.eye(self.basis_size)
-
-        for i in range(1, direction_matrix.shape[1]):
-            direction_matrix[:, i] = self._transformed_fun(phi=direction_matrix[:, i])
+        ] -= self.finite_difference_epsilon * self.basis - xp.repeat(
+            theta_dev.reshape(-1, 1), self.basis_size, 1
+        )
 
     def compute_gradient(self, function_evaluations, gradient) -> None:
         """Compute the gradient using finite differences.