deepmodeling · AsymmetryChou · Jun 19, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/dpnegf/negf/device_property.py b/dpnegf/negf/device_property.py
@@ -135,7 +135,7 @@ def set_leadLR(self, lead_L, lead_R):
 
 
     def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=True, Vbias=None,
-                           HS_inmem:bool=True, need_lesser:bool=False, need_greater:bool=False, need_gr_lc:bool=True):
+                           HS_inmem:bool=True, need_lesser:bool=False, need_greater:bool=False, need_gr_lc:bool=False):
         ''' computes the Green's function for a given energy and k-point in device.
 
         the tags used here to identify different Green's functions follows the NEGF theory 
@@ -273,11 +273,16 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
         else:
             s_in = 0
 
+        # gr_left is only consumed inside the lesser/greater forward pass of the
+        # kernel. If neither is active, the per-block list would sit on the GPU
+        # unread; ask the kernel to drop it so its slots are freed mid-sweep.
+        keep_gr_left = bool(need_lesser or need_greater)
         ans = recursive_gf(energy, hl=self.hl, hd=self.hd, hu=self.hu,
                             sd=self.sd, su=self.su, sl=self.sl,
                             left_se=seL, right_se=seR, seP=None, s_in=s_in,
                             s_out=None, eta=eta_device, E_ref=self.E_ref,
-                            need_lesser=need_lesser, need_greater=need_greater, need_gr_lc=need_gr_lc)
+                            need_lesser=need_lesser, need_greater=need_greater,
+                            need_gr_lc=need_gr_lc, keep_gr_left=keep_gr_left)
             # green shape [[g_trans, grd, grl,...],[g_trans, ...]]
 
         for t in range(len(tags)):
@@ -290,6 +295,16 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
 
         # self.green = update_temp_file(update_fn=fn, file_path=GFpath, ee=ee, tags=tags, info="Computing Green's Function")
 
+    def release_greenfuncs(self):
+        '''Drop the Green's-function dict so the underlying rgf_device storage
+        can be freed before the next energy chunk. H/S blocks are kept resident
+        (they are k,V-dependent, not energy-dependent). The runner is
+        responsible for restoring scalar lead.se references before calling
+        this, so any batched [B,n,n] copies become collectable too.'''
+        self.greenfuncs = 0
+        if isinstance(self.rgf_device, torch.device) and self.rgf_device.type == "cuda":
+            torch.cuda.empty_cache()
+
     def _cal_current_(self, espacing):
         '''calculate the current based on the voltage difference 
 
@@ -312,14 +327,6 @@ def _cal_current_(self, espacing):
         xl = min(v_L, v_R)-4*self.kBT
         xu = max(v_L, v_R)+4*self.kBT
 
-        def fcn(e):
-            self.cal_green_function()
-
-        cc = leggauss(fcn=self._cal_tc_)
-
-        int_grid, int_weight = gauss_xw(xl=xl, xu=xu, n=int((xu-xl)/espacing))
-
-
         self.__CURRENT__ = simpson(y=(self.lead_L.fermi_dirac(self.ee+self.E_ref) 
                                     - self.lead_R.fermi_dirac(self.ee+self.E_ref)) * self.tc, x=self.ee)
 

diff --git a/dpnegf/negf/recursive_green_cal.py b/dpnegf/negf/recursive_green_cal.py
@@ -4,7 +4,7 @@
 def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
                      sd, su, sl, s_in=0, s_out=0, eta=1e-5,
                      need_lesser=False, need_greater=False, need_gr_lc=False,
-                     stacked=False):
+                     stacked=False, keep_gr_left=True):
     """The recursive Green's function algorithm is taken from
     M. P. Anantram, M. S. Lundstrom and D. E. Nikonov, Proceedings of the IEEE, 96, 1511 - 1550 (2008)
     DOI: 10.1109/JPROC.2008.927355
@@ -92,10 +92,13 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
                 -mat_d[q + 1] - mat_l[q] @ gr_left[q] @ mat_u[q],
                 eye_bnn,
             )
+        # mat_d is dead after the forward sweep — backward sweep only reads mat_l/mat_u.
+        del mat_d
 
         grl = [None] * (num_of_matrices - 1)
         gru = [None] * (num_of_matrices - 1)
-        grd = [g.clone() for g in gr_left]
+        grd = [None] * num_of_matrices
+        grd[-1] = gr_left[-1].clone()
         g_trans = gr_left[-1].clone()
         gr_lc = [g_trans] if need_gr_lc else None
         for q in range(num_of_matrices - 2, -1, -1):
@@ -106,8 +109,13 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
             g_trans = gU @ g_trans
             if need_gr_lc:
                 gr_lc.append(g_trans)
+            del gU
         if need_gr_lc:
             gr_lc.reverse()
+        if not need_lesser and not need_greater:
+            # Stacked path: mat_l/mat_u are 4-D and can't be slice-freed mid-loop;
+            # they are dead now if no lesser/greater pass will read them.
+            del mat_l, mat_u
 
         gnd = gnl = gnu = gin_left = None
         if need_lesser:
@@ -120,7 +128,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
 
             gnl = [None] * (num_of_matrices - 1)
             gnu = [None] * (num_of_matrices - 1)
-            gnd = [g.clone() for g in gin_left]
+            gnd = [None] * num_of_matrices
+            gnd[-1] = gin_left[-1].clone()
             for q in range(num_of_matrices - 2, -1, -1):
                 gLmH = mat_l[q] @ gr_left[q].mH         # hoisted: used twice
                 gnl[q] = grd[q + 1] @ mat_l[q] @ gin_left[q] + gnd[q + 1] @ gLmH  # (B10)
@@ -140,7 +149,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
 
             gpl = [None] * (num_of_matrices - 1)
             gpu = [None] * (num_of_matrices - 1)
-            gpd = [g.clone() for g in gip_left]
+            gpd = [None] * num_of_matrices
+            gpd[-1] = gip_left[-1].clone()
             for q in range(num_of_matrices - 2, -1, -1):
                 lcgc = mat_l[q].conj() @ gr_left[q].conj()  # hoisted: used twice
                 gpl[q] = grd[q + 1] @ mat_l[q] @ gip_left[q] + gpd[q + 1] @ lcgc
@@ -150,6 +160,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
                          (gru[q] @ mat_l[q] @ gip_left[q])
                 gpu[q] = gpl[q].mH
 
+        if not keep_gr_left:
+            gr_left = None
         return _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,
                          gnd, gnl, gnu, gin_left,
                          gpd, gpl, gpu, gip_left,
@@ -161,7 +173,9 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
     e_bcast = (energy + 1j * eta).view(-1, 1, 1)
 
     for jj in range(len(mat_d_list)):
-        mat_d_list[jj] = mat_d_list[jj] - e_bcast * sd[jj]
+        # In-place: mat_d_list is a fresh tensor (wrapper's `* 1.` copy on D),
+        # so we can fuse the energy shift without the e_bcast*sd transient.
+        mat_d_list[jj].addcmul_(sd[jj], e_bcast, value=-1)
     for jj in range(len(mat_l_list)):
         mat_l_list[jj] = mat_l_list[jj] - e_bcast * sl[jj]
     for jj in range(len(mat_u_list)):
@@ -183,18 +197,29 @@ def _batched_eye(n):
     # ------------------ retarded Green's function ----------------------
     gr_left = [None] * num_of_matrices
     gr_left[0] = tLA.solve(-mat_d_list[0], _batched_eye(mat_shapes[0][-1]))
+    mat_d_list[0] = None  # consumed; free immediately
 
     for q in range(num_of_matrices - 1):  # (B2)
         gr_left[q + 1] = tLA.solve(
             -mat_d_list[q + 1] - mat_l_list[q] @ gr_left[q] @ mat_u_list[q],
             _batched_eye(mat_shapes[q + 1][-1]),
         )
+        mat_d_list[q + 1] = None  # consumed; backward sweep only reads mat_l/mat_u.
 
     grl = [None] * (num_of_matrices - 1)
     gru = [None] * (num_of_matrices - 1)
-    grd = [i.clone() for i in gr_left]
+    grd = [None] * num_of_matrices
+    grd[-1] = gr_left[-1].clone()
     g_trans = gr_left[-1].clone()
     gr_lc = [g_trans] if need_gr_lc else None
+    # Slots that go dead at the end of iteration q:
+    #   - mat_l_list[q], mat_u_list[q]: only re-read by the lesser/greater branches.
+    #   - gr_left[q]: dead unless the lesser/greater branch will consume it OR
+    #                 the caller asked us to keep the list intact.
+    # Nulling per slot lets the caching allocator coalesce its free list inside the
+    # loop instead of holding a long fragmented tail until the sweep ends.
+    drop_lu = not need_lesser and not need_greater
+    drop_gl = drop_lu and not keep_gr_left
     for q in range(num_of_matrices - 2, -1, -1):
         gU = gr_left[q] @ mat_u_list[q]                            # hoisted
         grl[q] = grd[q + 1] @ mat_l_list[q] @ gr_left[q]           # (B5)
@@ -203,6 +228,12 @@ def _batched_eye(n):
         g_trans = gU @ g_trans
         if need_gr_lc:
             gr_lc.append(g_trans)
+        del gU
+        if drop_lu:
+            mat_l_list[q] = None
+            mat_u_list[q] = None
+        if drop_gl:
+            gr_left[q] = None
     if need_gr_lc:
         gr_lc.reverse()
 
@@ -219,7 +250,8 @@ def _batched_eye(n):
 
         gnl = [None] * (num_of_matrices - 1)
         gnu = [None] * (num_of_matrices - 1)
-        gnd = [i.clone() for i in gin_left]
+        gnd = [None] * num_of_matrices
+        gnd[-1] = gin_left[-1].clone()
 
         for q in range(num_of_matrices - 2, -1, -1):
             gLmH = mat_l_list[q] @ gr_left[q].mH                   # hoisted
@@ -243,7 +275,8 @@ def _batched_eye(n):
 
         gpl = [None] * (num_of_matrices - 1)
         gpu = [None] * (num_of_matrices - 1)
-        gpd = [i.clone() for i in gip_left]
+        gpd = [None] * num_of_matrices
+        gpd[-1] = gip_left[-1].clone()
 
         for q in range(num_of_matrices - 2, -1, -1):
             lcgc = mat_l_list[q].conj() @ gr_left[q].conj()        # hoisted
@@ -255,6 +288,8 @@ def _batched_eye(n):
                      (gru[q] @ mat_l_list[q] @ gip_left[q])
             gpu[q] = gpl[q].mH
 
+    if not keep_gr_left:
+        gr_left = None
     return _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,
                      gnd, gnl, gnu, gin_left,
                      gpd, gpl, gpu, gip_left,
@@ -287,7 +322,8 @@ def _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,
 
 
 def recursive_gf(energy, hl, hd, hu, sd, su, sl, left_se, right_se, seP=None, E_ref=0.0, s_in=0, s_out=0,
-                 eta=1e-5, need_lesser=False, need_greater=False, need_gr_lc=False):
+                 eta=1e-5, need_lesser=False, need_greater=False, need_gr_lc=False,
+                 keep_gr_left=True):
 
     """The recursive Green's function algorithm is taken from
     M. P. Anantram, M. S. Lundstrom and D. E. Nikonov, Proceedings of the IEEE, 96, 1511 - 1550 (2008)
@@ -364,8 +400,10 @@ def _to_batch(t):
         return t
 
     temp_mat_d_list = [_to_batch(hd[i]) * 1. for i in range(len(hd))]
-    temp_mat_l_list = [_to_batch(hl[i]) * 1. for i in range(len(hl))]
-    temp_mat_u_list = [_to_batch(hu[i]) * 1. for i in range(len(hu))]
+    # L and U are only subtracted out-of-place inside the kernel; the expanded
+    # view is fine, and skipping the copy saves K x B x n^2 per list.
+    temp_mat_l_list = [_to_batch(hl[i]) for i in range(len(hl))]
+    temp_mat_u_list = [_to_batch(hu[i]) for i in range(len(hu))]
     sd_b = [_to_batch(sd[i]) for i in range(len(sd))]
     sl_b = [_to_batch(sl[i]) for i in range(len(sl))]
     su_b = [_to_batch(su[i]) for i in range(len(su))]
@@ -421,15 +459,17 @@ def _to_batch(t):
                                need_lesser=need_lesser,
                                need_greater=need_greater,
                                need_gr_lc=need_gr_lc,
-                               stacked=True)
+                               stacked=True,
+                               keep_gr_left=keep_gr_left)
     else:
         ans = recursive_gf_cal(shift_energy, temp_mat_l_list, temp_mat_d_list, temp_mat_u_list,
                                sd_b, su_b, sl_b,
                                s_in=s_in_b, s_out=s_out_b, eta=eta,
                                need_lesser=need_lesser,
                                need_greater=need_greater,
                                need_gr_lc=need_gr_lc,
-                               stacked=False)
+                               stacked=False,
+                               keep_gr_left=keep_gr_left)
 
     if squeezed:
         ans = _squeeze_ans(ans)

diff --git a/dpnegf/runner/NEGF.py b/dpnegf/runner/NEGF.py
@@ -69,6 +69,20 @@ def __init__(self,
         self.rgf_device = rgf_device
         self.n_cpus = n_cpus
         self.e_batch_size = e_batch_size
+
+        # The RGF q-loop allocates/frees many small slabs; with the default
+        # cudaMalloc-backed caching allocator this fragments quickly on long
+        # energy grids. expandable_segments avoids that, but must be set before
+        # torch initializes its CUDA context — by the time we get here it's
+        # already live, so we can only nudge the user.
+        if isinstance(self.rgf_device, torch.device) and self.rgf_device.type == "cuda":
+            if "expandable_segments" not in os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""):
+                log.warning(
+                    "RGF on CUDA can fragment the caching allocator on long energy "
+                    "grids. Consider exporting "
+                    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True BEFORE invoking "
+                    "dpnegf (must be set before torch's CUDA context initializes)."
+                )
 
         # get the parameters
         self.ele_T = ele_T
@@ -585,6 +599,37 @@ def prepare_self_energy(self, scf_require: bool) -> None:
 
 
 
+    def _auto_chunk_size(self, n_grid):
+        """Pick a chunk size from free CUDA memory when the user didn't set
+        ``e_batch_size``. Returns the full grid length on CPU / when the
+        device geometry isn't probable yet.
+
+        Per-energy peak (post per-slot-release, complex128) approximated as
+            bytes_per_E ~= C * K * n_max**2 * 16
+        with C bundling the live tensors in the worst backward-sweep slot
+        (grd full + grl + gru full + decaying gr_left tail + gU + transients).
+        C=10 with a 0.7x free-memory budget; deliberately conservative because
+        without expandable_segments the allocator can't defragment on demand.
+        """
+        rgf_dev = self.rgf_device
+        if not (isinstance(rgf_dev, torch.device) and rgf_dev.type == "cuda"):
+            return n_grid
+        try:
+            free_bytes, _total = torch.cuda.mem_get_info(rgf_dev)
+            n_max = max(int(b.shape[-1]) for b in self.deviceprop.hd)
+            K = len(self.deviceprop.hd)
+        except Exception:
+            return n_grid
+        per_e = 10 * K * (n_max ** 2) * 16
+        if per_e <= 0:
+            return n_grid
+        b = max(1, min(n_grid, int(0.7 * free_bytes) // per_e))
+        log.info(
+            f"auto e_batch_size={b} (free={free_bytes/2**30:.2f} GiB, "
+            f"per_E~={per_e/2**20:.1f} MiB, K={K}, n_max={n_max})"
+        )
+        return b
+
     def negf_compute(self,scf_require=False,Vbias=None):
 
         assert scf_require is not None, "scf_require should be set to True or False"
@@ -704,7 +749,10 @@ def negf_compute(self,scf_require=False,Vbias=None):
                                 self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(self.compute_LDOS(k))
                     else:
                         # Non-SCF: solve a whole chunk of energies in one batched recursive_gf call.
-                        chunk = self.e_batch_size if self.e_batch_size is not None else len(self.uni_grid)
+                        if self.e_batch_size is not None:
+                            chunk = self.e_batch_size
+                        else:
+                            chunk = self._auto_chunk_size(len(self.uni_grid))
                         for e_chunk in torch.split(self.uni_grid, chunk):
                             e_batch_size = len(e_chunk)
                             log.info(
@@ -741,19 +789,31 @@ def negf_compute(self,scf_require=False,Vbias=None):
                                 )
 
                             if self.out_dos:
-                                self.out.setdefault('DOS', {}).setdefault(str(k), []).append(self.compute_DOS(k).reshape(-1))
+                                self.out.setdefault('DOS', {}).setdefault(str(k), []).append(self.compute_DOS(k).reshape(-1).cpu())
                             if self.out_tc or self.out_current_nscf:
-                                self.out.setdefault('T_k', {}).setdefault(str(k), []).append(self.compute_TC(k).reshape(-1))
+                                self.out.setdefault('T_k', {}).setdefault(str(k), []).append(self.compute_TC(k).reshape(-1).cpu())
                             if self.out_ldos:
                                 ldos_chunk = self.compute_LDOS(k)
                                 if ldos_chunk.ndim == 1:  # scalar-E chunk → [na]
                                     ldos_chunk = ldos_chunk.unsqueeze(0)
-                                self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(ldos_chunk)
-
-                        # Restore lead.se to scalar [n,n] so downstream scalar callers
-                        # (density modules, lcurrent loop, future SCF re-entry) see the expected shape.
-                        self.deviceprop.lead_L.se = seL_list[-1]
-                        self.deviceprop.lead_R.se = seR_list[-1]
+                                self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(ldos_chunk.cpu())
+
+                        # Restore lead.se to a scalar [n,n] before releasing the GF
+                        # dict. For B>1 we clone the last per-E tensor so the new
+                        # lead.se doesn't share storage with anything still
+                        # referenced through seL_list/seR_list, then drop both
+                        # lists so release_greenfuncs's empty_cache() has the per-E
+                        # and stacked [B,n,n] copies to release.
+                        if e_batch_size > 1:
+                            self.deviceprop.lead_L.se = seL_list[-1].detach().clone()
+                            self.deviceprop.lead_R.se = seR_list[-1].detach().clone()
+                        else:
+                            # B=1 path: lead.se already IS the per-E [n,n] tensor;
+                            # preserve byte-identical behavior for the scalar case.
+                            self.deviceprop.lead_L.se = seL_list[-1]
+                            self.deviceprop.lead_R.se = seR_list[-1]
+                        del seL_list, seR_list
+                        self.deviceprop.release_greenfuncs()
 
                     # over energy loop in uni_gird
                     # The following code is for output properties before NEGF ends