Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions dpnegf/negf/device_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def set_leadLR(self, lead_L, lead_R):


def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=True, Vbias=None,
HS_inmem:bool=True, need_lesser:bool=False, need_greater:bool=False, need_gr_lc:bool=True):
HS_inmem:bool=True, need_lesser:bool=False, need_greater:bool=False, need_gr_lc:bool=False):
''' computes the Green's function for a given energy and k-point in device.

the tags used here to identify different Green's functions follows the NEGF theory
Expand Down Expand Up @@ -273,11 +273,16 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
else:
s_in = 0

# gr_left is only consumed inside the lesser/greater forward pass of the
# kernel. If neither is active, the per-block list would sit on the GPU
# unread; ask the kernel to drop it so its slots are freed mid-sweep.
keep_gr_left = bool(need_lesser or need_greater)
ans = recursive_gf(energy, hl=self.hl, hd=self.hd, hu=self.hu,
sd=self.sd, su=self.su, sl=self.sl,
left_se=seL, right_se=seR, seP=None, s_in=s_in,
s_out=None, eta=eta_device, E_ref=self.E_ref,
need_lesser=need_lesser, need_greater=need_greater, need_gr_lc=need_gr_lc)
need_lesser=need_lesser, need_greater=need_greater,
need_gr_lc=need_gr_lc, keep_gr_left=keep_gr_left)
# green shape [[g_trans, grd, grl,...],[g_trans, ...]]

for t in range(len(tags)):
Expand All @@ -290,6 +295,16 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr

# self.green = update_temp_file(update_fn=fn, file_path=GFpath, ee=ee, tags=tags, info="Computing Green's Function")

def release_greenfuncs(self):
'''Drop the Green's-function dict so the underlying rgf_device storage
can be freed before the next energy chunk. H/S blocks are kept resident
(they are k,V-dependent, not energy-dependent). The runner is
responsible for restoring scalar lead.se references before calling
this, so any batched [B,n,n] copies become collectable too.'''
self.greenfuncs = 0
if isinstance(self.rgf_device, torch.device) and self.rgf_device.type == "cuda":
torch.cuda.empty_cache()

def _cal_current_(self, espacing):
'''calculate the current based on the voltage difference

Expand All @@ -312,14 +327,6 @@ def _cal_current_(self, espacing):
xl = min(v_L, v_R)-4*self.kBT
xu = max(v_L, v_R)+4*self.kBT

def fcn(e):
self.cal_green_function()

cc = leggauss(fcn=self._cal_tc_)

int_grid, int_weight = gauss_xw(xl=xl, xu=xu, n=int((xu-xl)/espacing))


self.__CURRENT__ = simpson(y=(self.lead_L.fermi_dirac(self.ee+self.E_ref)
- self.lead_R.fermi_dirac(self.ee+self.E_ref)) * self.tc, x=self.ee)

Expand Down
66 changes: 53 additions & 13 deletions dpnegf/negf/recursive_green_cal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
sd, su, sl, s_in=0, s_out=0, eta=1e-5,
need_lesser=False, need_greater=False, need_gr_lc=False,
stacked=False):
stacked=False, keep_gr_left=True):
"""The recursive Green's function algorithm is taken from
M. P. Anantram, M. S. Lundstrom and D. E. Nikonov, Proceedings of the IEEE, 96, 1511 - 1550 (2008)
DOI: 10.1109/JPROC.2008.927355
Expand Down Expand Up @@ -92,10 +92,13 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
-mat_d[q + 1] - mat_l[q] @ gr_left[q] @ mat_u[q],
eye_bnn,
)
# mat_d is dead after the forward sweep — backward sweep only reads mat_l/mat_u.
del mat_d

grl = [None] * (num_of_matrices - 1)
gru = [None] * (num_of_matrices - 1)
grd = [g.clone() for g in gr_left]
grd = [None] * num_of_matrices
grd[-1] = gr_left[-1].clone()
g_trans = gr_left[-1].clone()
gr_lc = [g_trans] if need_gr_lc else None
for q in range(num_of_matrices - 2, -1, -1):
Expand All @@ -106,8 +109,13 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
g_trans = gU @ g_trans
if need_gr_lc:
gr_lc.append(g_trans)
del gU
if need_gr_lc:
gr_lc.reverse()
if not need_lesser and not need_greater:
# Stacked path: mat_l/mat_u are 4-D and can't be slice-freed mid-loop;
# they are dead now if no lesser/greater pass will read them.
del mat_l, mat_u

gnd = gnl = gnu = gin_left = None
if need_lesser:
Expand All @@ -120,7 +128,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,

gnl = [None] * (num_of_matrices - 1)
gnu = [None] * (num_of_matrices - 1)
gnd = [g.clone() for g in gin_left]
gnd = [None] * num_of_matrices
gnd[-1] = gin_left[-1].clone()
for q in range(num_of_matrices - 2, -1, -1):
gLmH = mat_l[q] @ gr_left[q].mH # hoisted: used twice
gnl[q] = grd[q + 1] @ mat_l[q] @ gin_left[q] + gnd[q + 1] @ gLmH # (B10)
Expand All @@ -140,7 +149,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,

gpl = [None] * (num_of_matrices - 1)
gpu = [None] * (num_of_matrices - 1)
gpd = [g.clone() for g in gip_left]
gpd = [None] * num_of_matrices
gpd[-1] = gip_left[-1].clone()
for q in range(num_of_matrices - 2, -1, -1):
lcgc = mat_l[q].conj() @ gr_left[q].conj() # hoisted: used twice
gpl[q] = grd[q + 1] @ mat_l[q] @ gip_left[q] + gpd[q + 1] @ lcgc
Expand All @@ -150,6 +160,8 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
(gru[q] @ mat_l[q] @ gip_left[q])
gpu[q] = gpl[q].mH

if not keep_gr_left:
gr_left = None
return _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,
gnd, gnl, gnu, gin_left,
gpd, gpl, gpu, gip_left,
Expand All @@ -161,7 +173,9 @@ def recursive_gf_cal(energy, mat_l_list, mat_d_list, mat_u_list,
e_bcast = (energy + 1j * eta).view(-1, 1, 1)

for jj in range(len(mat_d_list)):
mat_d_list[jj] = mat_d_list[jj] - e_bcast * sd[jj]
# In-place: mat_d_list is a fresh tensor (wrapper's `* 1.` copy on D),
# so we can fuse the energy shift without the e_bcast*sd transient.
mat_d_list[jj].addcmul_(sd[jj], e_bcast, value=-1)
for jj in range(len(mat_l_list)):
mat_l_list[jj] = mat_l_list[jj] - e_bcast * sl[jj]
for jj in range(len(mat_u_list)):
Expand All @@ -183,18 +197,29 @@ def _batched_eye(n):
# ------------------ retarded Green's function ----------------------
gr_left = [None] * num_of_matrices
gr_left[0] = tLA.solve(-mat_d_list[0], _batched_eye(mat_shapes[0][-1]))
mat_d_list[0] = None # consumed; free immediately

for q in range(num_of_matrices - 1): # (B2)
gr_left[q + 1] = tLA.solve(
-mat_d_list[q + 1] - mat_l_list[q] @ gr_left[q] @ mat_u_list[q],
_batched_eye(mat_shapes[q + 1][-1]),
)
mat_d_list[q + 1] = None # consumed; backward sweep only reads mat_l/mat_u.

grl = [None] * (num_of_matrices - 1)
gru = [None] * (num_of_matrices - 1)
grd = [i.clone() for i in gr_left]
grd = [None] * num_of_matrices
grd[-1] = gr_left[-1].clone()
g_trans = gr_left[-1].clone()
gr_lc = [g_trans] if need_gr_lc else None
# Slots that go dead at the end of iteration q:
# - mat_l_list[q], mat_u_list[q]: only re-read by the lesser/greater branches.
# - gr_left[q]: dead unless the lesser/greater branch will consume it OR
# the caller asked us to keep the list intact.
# Nulling per slot lets the caching allocator coalesce its free list inside the
# loop instead of holding a long fragmented tail until the sweep ends.
drop_lu = not need_lesser and not need_greater
drop_gl = drop_lu and not keep_gr_left
for q in range(num_of_matrices - 2, -1, -1):
gU = gr_left[q] @ mat_u_list[q] # hoisted
grl[q] = grd[q + 1] @ mat_l_list[q] @ gr_left[q] # (B5)
Expand All @@ -203,6 +228,12 @@ def _batched_eye(n):
g_trans = gU @ g_trans
if need_gr_lc:
gr_lc.append(g_trans)
del gU
if drop_lu:
mat_l_list[q] = None
mat_u_list[q] = None
if drop_gl:
gr_left[q] = None
if need_gr_lc:
gr_lc.reverse()

Expand All @@ -219,7 +250,8 @@ def _batched_eye(n):

gnl = [None] * (num_of_matrices - 1)
gnu = [None] * (num_of_matrices - 1)
gnd = [i.clone() for i in gin_left]
gnd = [None] * num_of_matrices
gnd[-1] = gin_left[-1].clone()

for q in range(num_of_matrices - 2, -1, -1):
gLmH = mat_l_list[q] @ gr_left[q].mH # hoisted
Expand All @@ -243,7 +275,8 @@ def _batched_eye(n):

gpl = [None] * (num_of_matrices - 1)
gpu = [None] * (num_of_matrices - 1)
gpd = [i.clone() for i in gip_left]
gpd = [None] * num_of_matrices
gpd[-1] = gip_left[-1].clone()

for q in range(num_of_matrices - 2, -1, -1):
lcgc = mat_l_list[q].conj() @ gr_left[q].conj() # hoisted
Expand All @@ -255,6 +288,8 @@ def _batched_eye(n):
(gru[q] @ mat_l_list[q] @ gip_left[q])
gpu[q] = gpl[q].mH

if not keep_gr_left:
gr_left = None
return _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,
gnd, gnl, gnu, gin_left,
gpd, gpl, gpu, gip_left,
Expand Down Expand Up @@ -287,7 +322,8 @@ def _pack_ans(g_trans, gr_lc, grd, grl, gru, gr_left,


def recursive_gf(energy, hl, hd, hu, sd, su, sl, left_se, right_se, seP=None, E_ref=0.0, s_in=0, s_out=0,
eta=1e-5, need_lesser=False, need_greater=False, need_gr_lc=False):
eta=1e-5, need_lesser=False, need_greater=False, need_gr_lc=False,
keep_gr_left=True):

"""The recursive Green's function algorithm is taken from
M. P. Anantram, M. S. Lundstrom and D. E. Nikonov, Proceedings of the IEEE, 96, 1511 - 1550 (2008)
Expand Down Expand Up @@ -364,8 +400,10 @@ def _to_batch(t):
return t

temp_mat_d_list = [_to_batch(hd[i]) * 1. for i in range(len(hd))]
temp_mat_l_list = [_to_batch(hl[i]) * 1. for i in range(len(hl))]
temp_mat_u_list = [_to_batch(hu[i]) * 1. for i in range(len(hu))]
# L and U are only subtracted out-of-place inside the kernel; the expanded
# view is fine, and skipping the copy saves K x B x n^2 per list.
temp_mat_l_list = [_to_batch(hl[i]) for i in range(len(hl))]
temp_mat_u_list = [_to_batch(hu[i]) for i in range(len(hu))]
sd_b = [_to_batch(sd[i]) for i in range(len(sd))]
sl_b = [_to_batch(sl[i]) for i in range(len(sl))]
su_b = [_to_batch(su[i]) for i in range(len(su))]
Expand Down Expand Up @@ -421,15 +459,17 @@ def _to_batch(t):
need_lesser=need_lesser,
need_greater=need_greater,
need_gr_lc=need_gr_lc,
stacked=True)
stacked=True,
keep_gr_left=keep_gr_left)
else:
ans = recursive_gf_cal(shift_energy, temp_mat_l_list, temp_mat_d_list, temp_mat_u_list,
sd_b, su_b, sl_b,
s_in=s_in_b, s_out=s_out_b, eta=eta,
need_lesser=need_lesser,
need_greater=need_greater,
need_gr_lc=need_gr_lc,
stacked=False)
stacked=False,
keep_gr_left=keep_gr_left)

if squeezed:
ans = _squeeze_ans(ans)
Expand Down
78 changes: 69 additions & 9 deletions dpnegf/runner/NEGF.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,20 @@ def __init__(self,
self.rgf_device = rgf_device
self.n_cpus = n_cpus
self.e_batch_size = e_batch_size

# The RGF q-loop allocates/frees many small slabs; with the default
# cudaMalloc-backed caching allocator this fragments quickly on long
# energy grids. expandable_segments avoids that, but must be set before
# torch initializes its CUDA context — by the time we get here it's
# already live, so we can only nudge the user.
if isinstance(self.rgf_device, torch.device) and self.rgf_device.type == "cuda":
if "expandable_segments" not in os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""):
log.warning(
"RGF on CUDA can fragment the caching allocator on long energy "
"grids. Consider exporting "
"PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True BEFORE invoking "
"dpnegf (must be set before torch's CUDA context initializes)."
)

# get the parameters
self.ele_T = ele_T
Expand Down Expand Up @@ -585,6 +599,37 @@ def prepare_self_energy(self, scf_require: bool) -> None:



def _auto_chunk_size(self, n_grid):
"""Pick a chunk size from free CUDA memory when the user didn't set
``e_batch_size``. Returns the full grid length on CPU / when the
device geometry isn't probable yet.

Per-energy peak (post per-slot-release, complex128) approximated as
bytes_per_E ~= C * K * n_max**2 * 16
with C bundling the live tensors in the worst backward-sweep slot
(grd full + grl + gru full + decaying gr_left tail + gU + transients).
C=10 with a 0.7x free-memory budget; deliberately conservative because
without expandable_segments the allocator can't defragment on demand.
"""
rgf_dev = self.rgf_device
if not (isinstance(rgf_dev, torch.device) and rgf_dev.type == "cuda"):
return n_grid
try:
free_bytes, _total = torch.cuda.mem_get_info(rgf_dev)
n_max = max(int(b.shape[-1]) for b in self.deviceprop.hd)
K = len(self.deviceprop.hd)
except Exception:
return n_grid
per_e = 10 * K * (n_max ** 2) * 16
if per_e <= 0:
return n_grid
b = max(1, min(n_grid, int(0.7 * free_bytes) // per_e))
log.info(
f"auto e_batch_size={b} (free={free_bytes/2**30:.2f} GiB, "
f"per_E~={per_e/2**20:.1f} MiB, K={K}, n_max={n_max})"
)
return b

def negf_compute(self,scf_require=False,Vbias=None):

assert scf_require is not None, "scf_require should be set to True or False"
Expand Down Expand Up @@ -704,7 +749,10 @@ def negf_compute(self,scf_require=False,Vbias=None):
self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(self.compute_LDOS(k))
else:
# Non-SCF: solve a whole chunk of energies in one batched recursive_gf call.
chunk = self.e_batch_size if self.e_batch_size is not None else len(self.uni_grid)
if self.e_batch_size is not None:
chunk = self.e_batch_size
else:
chunk = self._auto_chunk_size(len(self.uni_grid))
for e_chunk in torch.split(self.uni_grid, chunk):
e_batch_size = len(e_chunk)
log.info(
Expand Down Expand Up @@ -741,19 +789,31 @@ def negf_compute(self,scf_require=False,Vbias=None):
)

if self.out_dos:
self.out.setdefault('DOS', {}).setdefault(str(k), []).append(self.compute_DOS(k).reshape(-1))
self.out.setdefault('DOS', {}).setdefault(str(k), []).append(self.compute_DOS(k).reshape(-1).cpu())
if self.out_tc or self.out_current_nscf:
self.out.setdefault('T_k', {}).setdefault(str(k), []).append(self.compute_TC(k).reshape(-1))
self.out.setdefault('T_k', {}).setdefault(str(k), []).append(self.compute_TC(k).reshape(-1).cpu())
if self.out_ldos:
ldos_chunk = self.compute_LDOS(k)
if ldos_chunk.ndim == 1: # scalar-E chunk → [na]
ldos_chunk = ldos_chunk.unsqueeze(0)
self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(ldos_chunk)

# Restore lead.se to scalar [n,n] so downstream scalar callers
# (density modules, lcurrent loop, future SCF re-entry) see the expected shape.
self.deviceprop.lead_L.se = seL_list[-1]
self.deviceprop.lead_R.se = seR_list[-1]
self.out.setdefault('LDOS', {}).setdefault(str(k), []).append(ldos_chunk.cpu())

# Restore lead.se to a scalar [n,n] before releasing the GF
# dict. For B>1 we clone the last per-E tensor so the new
# lead.se doesn't share storage with anything still
# referenced through seL_list/seR_list, then drop both
# lists so release_greenfuncs's empty_cache() has the per-E
# and stacked [B,n,n] copies to release.
if e_batch_size > 1:
self.deviceprop.lead_L.se = seL_list[-1].detach().clone()
self.deviceprop.lead_R.se = seR_list[-1].detach().clone()
else:
# B=1 path: lead.se already IS the per-E [n,n] tensor;
# preserve byte-identical behavior for the scalar case.
self.deviceprop.lead_L.se = seL_list[-1]
self.deviceprop.lead_R.se = seR_list[-1]
del seL_list, seR_list
self.deviceprop.release_greenfuncs()

# over energy loop in uni_gird
# The following code is for output properties before NEGF ends
Expand Down
Loading