From 30cf672070bacb48f520044f42e63773755b81ef Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 13:33:57 +0000 Subject: [PATCH 001/242] first version of pobtas streaming --- src/serinv/algs/pobtas.py | 214 +++++++++++++++++- .../regular/tests_bta/test_pobtas.py | 27 ++- 2 files changed, 238 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index bab2a911..e0c87b82 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -4,6 +4,7 @@ from serinv import ( ArrayLike, _get_module_from_array, + _get_module_from_str, ) @@ -47,8 +48,14 @@ def pobtas( else: # Natural arrowhead if device_streaming: - raise NotImplementedError( - "Streaming is not implemented for the natural arrowhead." + _pobtas_streaming( + L_diagonal_blocks, + L_lower_diagonal_blocks, + L_lower_arrow_blocks, + L_arrow_tip_block, + B, + trans, + partial, ) else: _pobtas( @@ -216,3 +223,206 @@ def _pobtas_permuted( ) else: raise ValueError(f"Invalid transpose argument: {trans}.") + +def _pobtas_streaming( + L_diagonal_blocks: ArrayLike, + L_lower_diagonal_blocks: ArrayLike, + L_lower_arrow_blocks: ArrayLike, + L_arrow_tip_block: ArrayLike, + B: ArrayLike, + trans: str, + partial: bool, +): + arr_module, _ = _get_module_from_array(arr=L_diagonal_blocks) + if arr_module.__name__ != "numpy": + raise NotImplementedError( + "Host<->Device streaming only works when host-arrays are given." 
+ ) + + cp, cu_la = _get_module_from_str(module_str="cupy") + + # Streams and events + compute_stream = cp.cuda.Stream(non_blocking=True) + h2d_stream = cp.cuda.Stream(non_blocking=True) + d2h_stream = cp.cuda.Stream(non_blocking=True) + + h2d_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_lower_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_B_events = [cp.cuda.Event(), cp.cuda.Event()] + + d2h_B_events = [cp.cuda.Event(), cp.cuda.Event()] + + compute_current_B_events = [cp.cuda.Event(), cp.cuda.Event()] + compute_next_B_events = [cp.cuda.Event(), cp.cuda.Event()] + compute_arrow_B_events = [cp.cuda.Event(), cp.cuda.Event()] + + compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] + + #compute_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] + #compute_arrow_h2d_events = [cp.cuda.Event(), cp.cuda.Event()] + #compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] + #compute_B_h2d_events = [cp.cuda.Event(), cp.cuda.Event()] + + # Vars + diag_blocksize = L_diagonal_blocks.shape[1] + arrow_blocksize = L_lower_arrow_blocks.shape[1] + n_diag_blocks = L_diagonal_blocks.shape[0] + + # Device Buffers + # B Buffers + B_shape = B[0 : diag_blocksize] # block template + B_d = cp.empty( + (2, *B_shape.shape[1:]), dtype=B_shape.dtype + ) + B_shape = B[-arrow_blocksize:] + B_last_block_d = cp.empty_like(B_shape) + del B_shape + + # L Buffers + L_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_arrow_blocks_d = cp.empty( + (2, *L_lower_arrow_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) + + # Forward Pass + # --- C: events + transfers --- + compute_current_B_events[1].record(stream=compute_stream) + compute_next_B_events[1].record(stream=compute_stream) + 
compute_arrow_B_events[1].record(stream=compute_stream) + + B_last_block_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) + L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:, :], stream=h2d_stream) + + # --- H2D: transfers --- + B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) + h2d_B_events[0].record(stream=h2d_stream) + + L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) + h2d_diagonal_events[0].record(stream=h2d_stream) + + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + h2d_lower_diagonal_events[0].record(stream=h2d_stream) + + L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[0], stream=h2d_stream) + h2d_arrow_events[0].record(stream=h2d_stream) + + # --- D2H: event --- + d2h_B_events[1].record(stream=d2h_stream) + + n_diag_blocks: int = L_diagonal_blocks.shape[0] # why? + if n_diag_blocks > 1: + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + h2d_lower_diagonal_events[0].record(stream=h2d_stream) + + + + if trans == "N": + for i in range(0, n_diag_blocks-1): + # --- Forward substitution --- + with compute_stream: + # Compute step 1 : compute B + compute_stream.wait_event(h2d_diagonal_events[i % 2]) + compute_stream.wait_event(compute_arrow_B_events[i % 2]) + compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) + B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( + L_diagonal_blocks[i % 2], + B[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], + lower=True, + ) + compute_current_B_events[i % 2].record(stream=compute_stream) + + h2d_stream.wait_event(compute_current_B_events[i % 2]) + L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) + h2d_diagonal_events[i % 2].record(stream=h2d_stream) + + d2h_stream.wait_event(compute_next_B_events[i % 2]) + B_d[i % 2].get( + out=B[i * diag_blocksize : (i + 1) * diag_blocksize], + stream=d2h_stream, + blocking=False, + ) + 
d2h_B_events[i % 2].record(stream=d2h_stream) + + with compute_stream: + # 2 + compute_stream.wait_event(h2d_lower_diagonal_events[i % 2]) + compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) + compute_stream.wait_event(compute_current_B_events[i % 2]) + compute_stream.wait_event(compute_next_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2 * diag_blocksize : (i + 2) % 2 * diag_blocksize] -= ( + L_lower_diagonal_blocks[i%2] + @ B[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] + ) + compute_next_B_events[i % 2].record(stream=compute_stream) + + h2d_stream.wait_event(compute_next_B_events[i % 2]) + L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) + h2d_lower_diagonal_events[i % 2].record(stream=h2d_stream) + + with compute_stream: + # 3 + compute_stream.wait_event(h2d_arrow_events[i % 2]) + compute_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) + compute_stream.wait_event(compute_next_B_events[i % 2]) + B_last_block_d -= ( + L_lower_arrow_blocks_d[i % 2] + @ B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] + ) + compute_arrow_B_events[i % 2].record(stream=compute_stream) + + h2d_stream.wait_event(compute_arrow_B_events[i % 2]) + B_d[(i + 2) % 2].set(arr=B[(i + 2) * diag_blocksize : (i + 3) * diag_blocksize], stream = h2d_stream) + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + + L_lower_arrow_blocks_d[(i + 1) % 2].set(arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream) + h2d_arrow_events[i % 2].record(stream=h2d_stream) + + + if not partial: + # In the case of the partial solve, we do not solve the last block and + # arrow tip block of the RHS. 
+ + L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) + h2d_diagonal_events[0].record(stream=h2d_stream) + + L_lower_arrow_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) + h2d_arrow_events[0].record(stream=h2d_stream) + + + with compute_stream: + + compute_stream.wait_event(h2d_diagonal_events[0]) + B_last_block_d = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) + compute_partial_events[0].record(stream=compute_stream) + + compute_stream.wait_event(h2d_arrow_events[0]) + compute_stream.wait_event(compute_partial_events[0]) + B_last_block_d -= (L_lower_arrow_blocks_d[-1] @ B_last_block_d[1]) + compute_partial_events[1].record(stream=compute_stream) + + d2h_stream.wait_event(compute_partial_events[1]) + B_d[i % 2].get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + + # Y_{ndb+1} = L_{ndb+1,ndb+1}^{-1} (B_{ndb+1} - \Sigma_{i=1}^{ndb} L_{ndb+1,i} Y_{i) + + elif trans == "T" or trans == "C": + # ----- Backward substitution ----- + if not partial: + # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) + raise NotImplementedError( + "T and C not yet implemented." 
+ ) + # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) + + # for i in range(n_diag_blocks -2, -1, -1): + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + + else: + raise ValueError(f"Invalid transpose argument: {trans}.") \ No newline at end of file diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 647a0168..b8810c77 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -3,11 +3,14 @@ import numpy as np import pytest -from serinv import _get_module_from_array +from serinv import backend_flags, _get_module_from_array from ....testing_utils import bta_dense_to_arrays, dd_bta, symmetrize, rhs from serinv.algs import pobtaf, pobtas +if backend_flags["cupy_avail"]: + import cupyx as cpx + @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) @@ -19,6 +22,9 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): + + array_type = "streaming" + A = dd_bta( diagonal_blocksize, arrowhead_blocksize, @@ -51,6 +57,24 @@ def test_pobtas( A_arrow_tip_block, ) = bta_dense_to_arrays(A, diagonal_blocksize, arrowhead_blocksize, n_diag_blocks) + if backend_flags["cupy_avail"] and array_type == "streaming": + A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks) + A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks[:, :, :] + A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks) + A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks[:, :, :] + A_lower_arrow_blocks_pinned = cpx.zeros_like_pinned(A_lower_arrow_blocks) + A_lower_arrow_blocks_pinned[:, :, :] = A_lower_arrow_blocks[:, :, :] + A_arrow_tip_block_pinned = cpx.zeros_like_pinned(A_arrow_tip_block) + A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block[:, :] + B_pinned = cpx.zeros_like_pinned(B) + B_pinned[:, :] = B[:, :] + + A_diagonal_blocks = A_diagonal_blocks_pinned + 
A_lower_diagonal_blocks = A_lower_diagonal_blocks_pinned + A_lower_arrow_blocks = A_lower_arrow_blocks_pinned + A_arrow_tip_block = A_arrow_tip_block_pinned + B = B_pinned + pobtaf( A_diagonal_blocks, A_lower_diagonal_blocks, @@ -66,6 +90,7 @@ def test_pobtas( A_arrow_tip_block, B, trans="N", + device_streaming=True if array_type == "streaming" else False, ) # Backward solve: X=L^{-T}Y From 9e97c5e5030d65c26a8c27145c1fab0a1658307e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 13:36:35 +0000 Subject: [PATCH 002/242] change tests incase that broke it --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index b8810c77..94f73c2a 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -23,7 +23,6 @@ def test_pobtas( dtype: np.dtype, ): - array_type = "streaming" A = dd_bta( diagonal_blocksize, From 380a08492aa3cc1777cb74ef90020e773ae3d85f Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 13:38:32 +0000 Subject: [PATCH 003/242] test update --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 94f73c2a..ce9ac254 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - + array_type == "streaming" A = dd_bta( diagonal_blocksize, From 320ebef0661e40c7ac5df949c62e6b7b6c950981 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 13:43:10 +0000 Subject: [PATCH 004/242] typo --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index ce9ac254..cbb7aaed 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - array_type == "streaming" + array_type = "streaming" A = dd_bta( diagonal_blocksize, From 6eeb0f9c58cfc635cbd86b317eccd155b2d9b463 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:23:36 +0000 Subject: [PATCH 005/242] debug statements --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e0c87b82..b3b4ee4a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -272,9 +272,11 @@ def _pobtas_streaming( # Device Buffers # B Buffers B_shape = B[0 : diag_blocksize] # block template + print(B_shape) B_d = cp.empty( (2, *B_shape.shape[1:]), dtype=B_shape.dtype ) + print(B_d) B_shape = B[-arrow_blocksize:] B_last_block_d = cp.empty_like(B_shape) del B_shape From 137112fb0d8e5acafa3b5e595122ab498938399f Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:24:55 +0000 Subject: [PATCH 006/242] debug changes --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index b3b4ee4a..e50e05f9 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -272,10 +272,12 @@ def _pobtas_streaming( # Device Buffers # B Buffers B_shape = B[0 : diag_blocksize] # block template + print("B_shape") print(B_shape) B_d = cp.empty( (2, *B_shape.shape[1:]), dtype=B_shape.dtype ) + print("B_d") print(B_d) B_shape = B[-arrow_blocksize:] B_last_block_d = cp.empty_like(B_shape) From dfab4ab20897451211e050eddadff65599bda3ba Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:27:13 +0000 Subject: [PATCH 007/242] debug messages --- 
src/serinv/algs/pobtas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e50e05f9..8b45d3d9 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -305,6 +305,10 @@ def _pobtas_streaming( L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:, :], stream=h2d_stream) # --- H2D: transfers --- + print("B block") + print(B[0 : 1 * diag_blocksize]) + print("B_d 0") + print(B_d[0]) B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) h2d_B_events[0].record(stream=h2d_stream) From e068b847d0e7c191a83be0f154d99725fec29745 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:30:42 +0000 Subject: [PATCH 008/242] print B --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 8b45d3d9..c4ac087e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -307,6 +307,8 @@ def _pobtas_streaming( # --- H2D: transfers --- print("B block") print(B[0 : 1 * diag_blocksize]) + print("B") + print(B) print("B_d 0") print(B_d[0]) B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) From 2981b3edca9105708d47d44f0053e32e92c50a1c Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:31:52 +0000 Subject: [PATCH 009/242] changed B_d shape --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index c4ac087e..0c1c0d64 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -275,7 +275,7 @@ def _pobtas_streaming( print("B_shape") print(B_shape) B_d = cp.empty( - (2, *B_shape.shape[1:]), dtype=B_shape.dtype + (2, *B_shape.shape), dtype=B_shape.dtype ) print("B_d") print(B_d) From 29c674fe4972716cf9c34906010cdb4efb5e73b6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:38:55 +0000 Subject: [PATCH 010/242] changed wrong arrays in streaming --- 
src/serinv/algs/pobtas.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0c1c0d64..3035e889 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -259,11 +259,6 @@ def _pobtas_streaming( compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] - #compute_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] - #compute_arrow_h2d_events = [cp.cuda.Event(), cp.cuda.Event()] - #compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] - #compute_B_h2d_events = [cp.cuda.Event(), cp.cuda.Event()] - # Vars diag_blocksize = L_diagonal_blocks.shape[1] arrow_blocksize = L_lower_arrow_blocks.shape[1] @@ -272,13 +267,9 @@ def _pobtas_streaming( # Device Buffers # B Buffers B_shape = B[0 : diag_blocksize] # block template - print("B_shape") - print(B_shape) B_d = cp.empty( (2, *B_shape.shape), dtype=B_shape.dtype ) - print("B_d") - print(B_d) B_shape = B[-arrow_blocksize:] B_last_block_d = cp.empty_like(B_shape) del B_shape @@ -305,12 +296,6 @@ def _pobtas_streaming( L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:, :], stream=h2d_stream) # --- H2D: transfers --- - print("B block") - print(B[0 : 1 * diag_blocksize]) - print("B") - print(B) - print("B_d 0") - print(B_d[0]) B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) h2d_B_events[0].record(stream=h2d_stream) @@ -331,7 +316,6 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_lower_diagonal_events[0].record(stream=h2d_stream) - if trans == "N": for i in range(0, n_diag_blocks-1): @@ -342,8 +326,8 @@ def _pobtas_streaming( compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( - L_diagonal_blocks[i % 2], - B[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], + L_diagonal_blocks_d[i % 
2], + B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], lower=True, ) compute_current_B_events[i % 2].record(stream=compute_stream) From 3bffe7d8ab696fa83c7a24be492868e8fa1944fc Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:41:45 +0000 Subject: [PATCH 011/242] debug shapes --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 3035e889..33e4aea3 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,6 +325,8 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_diagonal_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) + print(B_d.shape()) + print(L_diagonal_blocks_d.shape()) B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], From 1e8acadb0d74e1c619ca05404ea7b59775374443 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:43:22 +0000 Subject: [PATCH 012/242] typo --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 33e4aea3..07e01c89 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,8 +325,8 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_diagonal_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - print(B_d.shape()) - print(L_diagonal_blocks_d.shape()) + print(B_d.shape) + print(L_diagonal_blocks_d.shape) B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], From ec0a01b0fc037f4eab2f751808ce5b0ab2abc71e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 
14:45:30 +0000 Subject: [PATCH 013/242] compare B and L --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 07e01c89..a2dca781 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,8 +325,8 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_diagonal_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - print(B_d.shape) - print(L_diagonal_blocks_d.shape) + print(B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize].shape) + print(L_diagonal_blocks_d[i % 2].shape) B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], From 6a657cc6fac80f3ec1e1968a3741a8ff003eabb9 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:51:36 +0000 Subject: [PATCH 014/242] changed B slice in 1 --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a2dca781..7562555c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,7 +325,7 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_diagonal_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - print(B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize].shape) + print(B_d[i % 2].shape) print(L_diagonal_blocks_d[i % 2].shape) B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From c3bb244ce94fb56ddcc49603ba9614ce93327652 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:52:35 +0000 Subject: [PATCH 015/242] changed actual B slices --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 7562555c..008d99fc 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -327,9 +327,9 @@ def _pobtas_streaming( compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) print(B_d[i % 2].shape) print(L_diagonal_blocks_d[i % 2].shape) - B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] = cu_la.solve_triangular( + B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], - B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize], + B_d[i % 2], lower=True, ) compute_current_B_events[i % 2].record(stream=compute_stream) From 11a9cc63b7af57d08f333910a0f7bc9ccce1b952 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 14:54:31 +0000 Subject: [PATCH 016/242] changed further B slice --- src/serinv/algs/pobtas.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 008d99fc..6c1a259f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,8 +325,6 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_diagonal_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[i % 2]) compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - print(B_d[i % 2].shape) - print(L_diagonal_blocks_d[i % 2].shape) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2], @@ -352,9 +350,9 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) compute_stream.wait_event(compute_current_B_events[i % 2]) compute_stream.wait_event(compute_next_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2 * diag_blocksize : (i + 2) % 2 * diag_blocksize] -= ( - L_lower_diagonal_blocks[i%2] - @ B[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] + B_d[(i + 1) % 2] -= ( + L_lower_diagonal_blocks[i % 2] + @ B[i % 2] ) compute_next_B_events[i % 2].record(stream=compute_stream) From 
935848de17603d7bb4f91b2e00093db143157cd3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:00:16 +0000 Subject: [PATCH 017/242] fixed typos --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6c1a259f..4647eeda 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -351,8 +351,8 @@ def _pobtas_streaming( compute_stream.wait_event(compute_current_B_events[i % 2]) compute_stream.wait_event(compute_next_B_events[(i + 1) % 2]) B_d[(i + 1) % 2] -= ( - L_lower_diagonal_blocks[i % 2] - @ B[i % 2] + L_lower_diagonal_blocks_d[i % 2] + @ B_d[i % 2] ) compute_next_B_events[i % 2].record(stream=compute_stream) From f527a69a5bfc1b9041c8db5b7cb79f96119ff2c5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:01:36 +0000 Subject: [PATCH 018/242] changed last B slice --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 4647eeda..45271abd 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -367,7 +367,7 @@ def _pobtas_streaming( compute_stream.wait_event(compute_next_B_events[i % 2]) B_last_block_d -= ( L_lower_arrow_blocks_d[i % 2] - @ B_d[i % 2 * diag_blocksize : (i + 1) % 2 * diag_blocksize] + @ B_d[i % 2] ) compute_arrow_B_events[i % 2].record(stream=compute_stream) From 69fc9a064f66f81f859e421e2bd559532f4942de Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:17:24 +0000 Subject: [PATCH 019/242] changed index for lower diag blocks --- src/serinv/algs/pobtas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 45271abd..1420c97e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -302,8 +302,8 @@ def _pobtas_streaming( L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], 
stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - h2d_lower_diagonal_events[0].record(stream=h2d_stream) + #L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + #h2d_lower_diagonal_events[0].record(stream=h2d_stream) L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[0], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) @@ -318,7 +318,7 @@ def _pobtas_streaming( if trans == "N": - for i in range(0, n_diag_blocks-1): + for i in range(0, n_diag_blocks - 1): # --- Forward substitution --- with compute_stream: # Compute step 1 : compute B @@ -357,7 +357,7 @@ def _pobtas_streaming( compute_next_B_events[i % 2].record(stream=compute_stream) h2d_stream.wait_event(compute_next_B_events[i % 2]) - L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) + L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) h2d_lower_diagonal_events[i % 2].record(stream=h2d_stream) with compute_stream: From 467ce64abfd2bd81cc79785e21860757ade295f2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:18:54 +0000 Subject: [PATCH 020/242] changed index for diagonal blocks --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1420c97e..7284ee28 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -333,7 +333,7 @@ def _pobtas_streaming( compute_current_B_events[i % 2].record(stream=compute_stream) h2d_stream.wait_event(compute_current_B_events[i % 2]) - L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) + L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) h2d_diagonal_events[i % 2].record(stream=h2d_stream) d2h_stream.wait_event(compute_next_B_events[i % 
2]) From aa5c893c3fde04cc120eb55dcca23c761746a32e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:32:28 +0000 Subject: [PATCH 021/242] inserted ifs for termination --- src/serinv/algs/pobtas.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 7284ee28..27e1412b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -313,9 +313,18 @@ def _pobtas_streaming( n_diag_blocks: int = L_diagonal_blocks.shape[0] # why? if n_diag_blocks > 1: + B_d[1].set(arr=B[1 * diag_blocksize : 2 * diag_blocksize], stream = h2d_stream) + h2d_B_events[1].record(stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_lower_diagonal_events[0].record(stream=h2d_stream) + L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) + h2d_diagonal_events[1].record(stream=h2d_stream) + + L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) + h2d_lower_diagonal_events[1].record(stream=h2d_stream) + if trans == "N": for i in range(0, n_diag_blocks - 1): @@ -332,9 +341,10 @@ def _pobtas_streaming( ) compute_current_B_events[i % 2].record(stream=compute_stream) - h2d_stream.wait_event(compute_current_B_events[i % 2]) - L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) - h2d_diagonal_events[i % 2].record(stream=h2d_stream) + if i + 2 < n_diag_blocks - 1: + h2d_stream.wait_event(compute_current_B_events[i % 2]) + L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) + h2d_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) d2h_stream.wait_event(compute_next_B_events[i % 2]) B_d[i % 2].get( @@ -356,9 +366,10 @@ def _pobtas_streaming( ) compute_next_B_events[i % 2].record(stream=compute_stream) - h2d_stream.wait_event(compute_next_B_events[i % 2]) - L_lower_diagonal_blocks_d[(i + 1) % 
2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) - h2d_lower_diagonal_events[i % 2].record(stream=h2d_stream) + if i + 2 < n_diag_blocks - 1: + h2d_stream.wait_event(compute_next_B_events[i % 2]) + L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) + h2d_lower_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) with compute_stream: # 3 @@ -371,12 +382,13 @@ def _pobtas_streaming( ) compute_arrow_B_events[i % 2].record(stream=compute_stream) - h2d_stream.wait_event(compute_arrow_B_events[i % 2]) - B_d[(i + 2) % 2].set(arr=B[(i + 2) * diag_blocksize : (i + 3) * diag_blocksize], stream = h2d_stream) - h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + if i + 2 < n_diag_blocks - 1: + h2d_stream.wait_event(compute_arrow_B_events[i % 2]) + B_d[(i + 2) % 2].set(arr=B[(i + 2) * diag_blocksize : (i + 3) * diag_blocksize], stream = h2d_stream) + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) - L_lower_arrow_blocks_d[(i + 1) % 2].set(arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream) - h2d_arrow_events[i % 2].record(stream=h2d_stream) + L_lower_arrow_blocks_d[(i + 1) % 2].set(arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream) + h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) if not partial: From 5ab5cf902b457740ac79d42ba359c3ca4b35b10e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:33:53 +0000 Subject: [PATCH 022/242] fixed typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 27e1412b..ad4ca3de 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -322,7 +322,7 @@ def _pobtas_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) h2d_diagonal_events[1].record(stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) + 
L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) h2d_lower_diagonal_events[1].record(stream=h2d_stream) From 069f355faa868d532b39e25d493ff791044bb49b Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:35:59 +0000 Subject: [PATCH 023/242] fixed typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index ad4ca3de..5972ea1c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -410,7 +410,7 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_arrow_events[0]) compute_stream.wait_event(compute_partial_events[0]) - B_last_block_d -= (L_lower_arrow_blocks_d[-1] @ B_last_block_d[1]) + B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d[1]) compute_partial_events[1].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[1]) From e15278663d7a811d3cb3f322ef0d5b57b9c9bd10 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:37:59 +0000 Subject: [PATCH 024/242] insert debug prints --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 5972ea1c..38dbc43a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -410,6 +410,8 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_arrow_events[0]) compute_stream.wait_event(compute_partial_events[0]) + print(B_last_block_d.shape) + print(L-L_lower_arrow_blocks_d.shape) B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d[1]) compute_partial_events[1].record(stream=compute_stream) From f932f67e68f2c9f3bab443053f5a0b9202312061 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:38:33 +0000 Subject: [PATCH 025/242] typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 38dbc43a..f0310332 100644 
--- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -411,7 +411,7 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_arrow_events[0]) compute_stream.wait_event(compute_partial_events[0]) print(B_last_block_d.shape) - print(L-L_lower_arrow_blocks_d.shape) + print(L_lower_arrow_blocks_d.shape) B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d[1]) compute_partial_events[1].record(stream=compute_stream) From 730de95f43c3115f2618cadabe4b49e249e58450 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:40:09 +0000 Subject: [PATCH 026/242] changed b last block --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f0310332..55e089c8 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -412,7 +412,7 @@ def _pobtas_streaming( compute_stream.wait_event(compute_partial_events[0]) print(B_last_block_d.shape) print(L_lower_arrow_blocks_d.shape) - B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d[1]) + B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d) compute_partial_events[1].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[1]) From b61f8644df2b750ed666ffde0941fe5ed0775740 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:43:41 +0000 Subject: [PATCH 027/242] fixed lower arrow blocks in partial --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 55e089c8..dd2dd6a1 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -398,7 +398,7 @@ def _pobtas_streaming( L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) - L_lower_arrow_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) + 
L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) From 81a063f610b6b76d51228b57d2d90a81157ef6bd Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:48:33 +0000 Subject: [PATCH 028/242] changed typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index dd2dd6a1..592000b4 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -416,7 +416,7 @@ def _pobtas_streaming( compute_partial_events[1].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[1]) - B_d[i % 2].get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_last_block_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) # Y_{ndb+1} = L_{ndb+1,ndb+1}^{-1} (B_{ndb+1} - \Sigma_{i=1}^{ndb} L_{ndb+1,i} Y_{i) From 51acd9aea4bbae5be647ccf9e9b38e013ae9bfe2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:49:49 +0000 Subject: [PATCH 029/242] changed test --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index cbb7aaed..b6bf238d 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - array_type = "streaming" + #array_type = "streaming" A = dd_bta( diagonal_blocksize, From 66a23f8389c0420de35995bd3dee995b3d89d0c6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 24 Apr 2025 15:52:25 +0000 Subject: [PATCH 030/242] new debug print --- src/serinv/algs/pobtas.py | 1 + tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 
592000b4..95bc5552 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -412,6 +412,7 @@ def _pobtas_streaming( compute_stream.wait_event(compute_partial_events[0]) print(B_last_block_d.shape) print(L_lower_arrow_blocks_d.shape) + print(L_lower_arrow_blocks_d[1] @ B_last_block_d) B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d) compute_partial_events[1].record(stream=compute_stream) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index b6bf238d..cbb7aaed 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - #array_type = "streaming" + array_type = "streaming" A = dd_bta( diagonal_blocksize, From 7a456a58ca15f606a61c54f998ecaa1a43e61ef8 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:26:44 +0000 Subject: [PATCH 031/242] changed logic to accomodate arrow sizes --- src/serinv/algs/pobtas.py | 84 ++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 14 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 95bc5552..f289ced2 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -250,8 +250,10 @@ def _pobtas_streaming( h2d_lower_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_B_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_tip_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_B_events = [cp.cuda.Event(), cp.cuda.Event()] + d2h_tip_events = [cp.cuda.Event(), cp.cuda.Event()] compute_current_B_events = [cp.cuda.Event(), cp.cuda.Event()] compute_next_B_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -271,7 +273,7 @@ def _pobtas_streaming( (2, *B_shape.shape), dtype=B_shape.dtype ) B_shape = B[-arrow_blocksize:] - B_last_block_d = cp.empty_like(B_shape) + B_arrow_tip_d = 
cp.empty_like(B_shape) del B_shape # L Buffers @@ -292,7 +294,7 @@ def _pobtas_streaming( compute_next_B_events[1].record(stream=compute_stream) compute_arrow_B_events[1].record(stream=compute_stream) - B_last_block_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:, :], stream=h2d_stream) # --- H2D: transfers --- @@ -310,6 +312,7 @@ def _pobtas_streaming( # --- D2H: event --- d2h_B_events[1].record(stream=d2h_stream) + n_diag_blocks: int = L_diagonal_blocks.shape[0] # why? if n_diag_blocks > 1: @@ -345,14 +348,18 @@ def _pobtas_streaming( h2d_stream.wait_event(compute_current_B_events[i % 2]) L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) h2d_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) + if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): + B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream,) - d2h_stream.wait_event(compute_next_B_events[i % 2]) + d2h_stream.wait_event(compute_current_B_events[i % 2]) B_d[i % 2].get( out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False, ) d2h_B_events[i % 2].record(stream=d2h_stream) + + with compute_stream: # 2 @@ -370,22 +377,54 @@ def _pobtas_streaming( h2d_stream.wait_event(compute_next_B_events[i % 2]) L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) h2d_lower_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) + + if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): + d2h_stream.wait_event(compute_next_B_events[i % 2]) + B_d[(i + 1) % 2].get( + out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream=d2h_stream, + blocking=False, + ) + d2h_B_events[(i + 1) % 2].record(stream=d2h_stream) + + h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) + 
B_arrow_tip_d.set(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + h2d_tip_events[i % 2].record(stream=h2d_stream) + with compute_stream: # 3 compute_stream.wait_event(h2d_arrow_events[i % 2]) compute_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) compute_stream.wait_event(compute_next_B_events[i % 2]) - B_last_block_d -= ( + if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): + compute_stream.wait_event(h2d_tip_events[i % 2]) + + B_arrow_tip_d -= ( L_lower_arrow_blocks_d[i % 2] @ B_d[i % 2] ) + compute_arrow_B_events[i % 2].record(stream=compute_stream) + - if i + 2 < n_diag_blocks - 1: + # make sure that arrowtip and B overlap gets resolved + if ((i + 3) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): h2d_stream.wait_event(compute_arrow_B_events[i % 2]) B_d[(i + 2) % 2].set(arr=B[(i + 2) * diag_blocksize : (i + 3) * diag_blocksize], stream = h2d_stream) - h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + h2d_B_events[(i + 2) % 2].record(stream=h2d_stream) + + else: + d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + d2h_tip_events[i % 2].record(stream=d2h_stream) + + if i + 1 < n_diag_blocks - 1: + B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream) + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + + + if i + 2 < n_diag_blocks - 1: L_lower_arrow_blocks_d[(i + 1) % 2].set(arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream) h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) @@ -395,31 +434,48 @@ def _pobtas_streaming( # In the case of the partial solve, we do not solve the last block and # arrow tip block of the RHS. 
+ h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) + B_d[0].set(arr=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=h2d_stream,) + h2d_B_events[0].record(stream=h2d_stream) + L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) + with compute_stream: compute_stream.wait_event(h2d_diagonal_events[0]) - B_last_block_d = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) + compute_stream.wait_event(h2d_B_events[0]) + B_d = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) compute_partial_events[0].record(stream=compute_stream) + d2h_stream.wait_event(compute_partial_events[0]) + B_d[0].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) + d2h_B_events[0].record(stream=d2h_stream) + + h2d_stream.wait_event(d2h_B_events[0]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + h2d_tip_events[0].record(stream=h2d_stream) + + with compute_stream: compute_stream.wait_event(h2d_arrow_events[0]) + compute_stream.wait_event(h2d_tip_events[0]) compute_stream.wait_event(compute_partial_events[0]) - print(B_last_block_d.shape) - print(L_lower_arrow_blocks_d.shape) - print(L_lower_arrow_blocks_d[1] @ B_last_block_d) - B_last_block_d -= (L_lower_arrow_blocks_d[1] @ B_last_block_d) + + B_arrow_tip_d -= (L_lower_arrow_blocks_d[1] @ B_arrow_tip_d) compute_partial_events[1].record(stream=compute_stream) - d2h_stream.wait_event(compute_partial_events[1]) - B_last_block_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + compute_stream.wait_event(compute_partial_events[1]) + B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) + 
compute_partial_events[0].record(stream=compute_stream) + + d2h_stream.wait_event(compute_partial_events[0]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - # Y_{ndb+1} = L_{ndb+1,ndb+1}^{-1} (B_{ndb+1} - \Sigma_{i=1}^{ndb} L_{ndb+1,i} Y_{i) elif trans == "T" or trans == "C": # ----- Backward substitution ----- From b7fa179c9ba859daef544eb06f117dafadf17cd4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:29:18 +0000 Subject: [PATCH 032/242] typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f289ced2..1219e182 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -388,7 +388,7 @@ def _pobtas_streaming( d2h_B_events[(i + 1) % 2].record(stream=d2h_stream) h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) - B_arrow_tip_d.set(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) h2d_tip_events[i % 2].record(stream=h2d_stream) From 1d550cd6ff091b701606cd6908c6919c511d11be Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:29:53 +0000 Subject: [PATCH 033/242] fixed function --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1219e182..df1d4814 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -388,7 +388,7 @@ def _pobtas_streaming( d2h_B_events[(i + 1) % 2].record(stream=d2h_stream) h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream,) h2d_tip_events[i % 2].record(stream=h2d_stream) From 8c221e880c0574e7efd15c85c5da1e67857b2901 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:32:25 +0000 Subject: [PATCH 
034/242] insert debug statements --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index df1d4814..33847c30 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -454,6 +454,8 @@ def _pobtas_streaming( compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) + print(B_d[0]) + print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) B_d[0].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) d2h_B_events[0].record(stream=d2h_stream) From 380340e2374adc18320757329951eb31193ed9e6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:35:39 +0000 Subject: [PATCH 035/242] more debugging --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 33847c30..861952f8 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -439,6 +439,7 @@ def _pobtas_streaming( h2d_diagonal_events[0].record(stream=h2d_stream) B_d[0].set(arr=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=h2d_stream,) + print(B_d[0]) h2d_B_events[0].record(stream=h2d_stream) L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) From d41e4e916a786dfba86c69f702ba46e144b3dca3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:39:47 +0000 Subject: [PATCH 036/242] debugging second to last solve --- src/serinv/algs/pobtas.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 861952f8..b767b300 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -445,7 +445,18 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) 
h2d_arrow_events[0].record(stream=h2d_stream) - + B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize] = ( + la.solve_triangular( + L_diagonal_blocks[n_diag_blocks - 1], + B[ + (n_diag_blocks - 1) + * diag_blocksize : n_diag_blocks + * diag_blocksize + ], + lower=True, + ) + ) + print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) with compute_stream: @@ -461,7 +472,7 @@ def _pobtas_streaming( d2h_B_events[0].record(stream=d2h_stream) h2d_stream.wait_event(d2h_B_events[0]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) h2d_tip_events[0].record(stream=h2d_stream) with compute_stream: From 7f8ae975cbe78cc618443576f32205e38306b7e7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:41:53 +0000 Subject: [PATCH 037/242] fixed second to last solve --- src/serinv/algs/pobtas.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index b767b300..9b952100 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -445,24 +445,12 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) - B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize] = ( - la.solve_triangular( - L_diagonal_blocks[n_diag_blocks - 1], - B[ - (n_diag_blocks - 1) - * diag_blocksize : n_diag_blocks - * diag_blocksize - ], - lower=True, - ) - ) - print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) with compute_stream: compute_stream.wait_event(h2d_diagonal_events[0]) compute_stream.wait_event(h2d_B_events[0]) - B_d = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) + B_d[0] = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) 
compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) From c2b52aaa6009167238355655671658d4b0834924 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:42:53 +0000 Subject: [PATCH 038/242] removed debugging statements --- src/serinv/algs/pobtas.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 9b952100..4d0bbf08 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -439,7 +439,6 @@ def _pobtas_streaming( h2d_diagonal_events[0].record(stream=h2d_stream) B_d[0].set(arr=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=h2d_stream,) - print(B_d[0]) h2d_B_events[0].record(stream=h2d_stream) L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) @@ -454,13 +453,11 @@ def _pobtas_streaming( compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) - print(B_d[0]) - print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) B_d[0].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) d2h_B_events[0].record(stream=d2h_stream) h2d_stream.wait_event(d2h_B_events[0]) - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream,) h2d_tip_events[0].record(stream=h2d_stream) with compute_stream: From 11f838ecca8400e2f101094aa99f0ba2fecbe989 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:52:28 +0000 Subject: [PATCH 039/242] insert debug statements --- src/serinv/algs/pobtas.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 4d0bbf08..c566faa0 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -465,6 +465,9 @@ def _pobtas_streaming( 
compute_stream.wait_event(h2d_tip_events[0]) compute_stream.wait_event(compute_partial_events[0]) + print(L_lower_arrow_blocks_d[1]) + print(B_arrow_tip_d) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[1] @ B_arrow_tip_d) compute_partial_events[1].record(stream=compute_stream) From 133d0067825f14d9cadffb62a9b9b693950cf753 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:55:50 +0000 Subject: [PATCH 040/242] fixed index typo --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index c566faa0..9dc7620f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -465,10 +465,10 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_tip_events[0]) compute_stream.wait_event(compute_partial_events[0]) - print(L_lower_arrow_blocks_d[1]) + print(L_lower_arrow_blocks_d[0]) print(B_arrow_tip_d) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[1] @ B_arrow_tip_d) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_arrow_tip_d) compute_partial_events[1].record(stream=compute_stream) compute_stream.wait_event(compute_partial_events[1]) From 2ab30872d952a3232e73adafc2674fce87540575 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 13:58:14 +0000 Subject: [PATCH 041/242] changed debug statement --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 9dc7620f..bce926ba 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -465,7 +465,7 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_tip_events[0]) compute_stream.wait_event(compute_partial_events[0]) - print(L_lower_arrow_blocks_d[0]) + print(L_lower_arrow_blocks_d) print(B_arrow_tip_d) B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_arrow_tip_d) From aeca9e1d9d0b71458abd857ec7be704bdca7e693 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 14:01:18 +0000 Subject: 
[PATCH 042/242] changed operation order --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index bce926ba..f0ea1c06 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -468,11 +468,12 @@ def _pobtas_streaming( print(L_lower_arrow_blocks_d) print(B_arrow_tip_d) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_arrow_tip_d) + + B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) compute_partial_events[1].record(stream=compute_stream) compute_stream.wait_event(compute_partial_events[1]) - B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_arrow_tip_d) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) From 78e8e25c38bd705b52eecf4a25291d53bead75ba Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 14:08:23 +0000 Subject: [PATCH 043/242] changed to right B --- src/serinv/algs/pobtas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f0ea1c06..f9f40b15 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -468,12 +468,11 @@ def _pobtas_streaming( print(L_lower_arrow_blocks_d) print(B_arrow_tip_d) - - B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[0]) compute_partial_events[1].record(stream=compute_stream) compute_stream.wait_event(compute_partial_events[1]) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_arrow_tip_d) + B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) From f8f5b6444d5a0e9b93a59bcffbadf561bfedc3fc Mon Sep 17 00:00:00 
2001 From: 03szust Date: Fri, 25 Apr 2025 14:13:52 +0000 Subject: [PATCH 044/242] setup corrected for out of bounds --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f9f40b15..5d940ff9 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -325,8 +325,9 @@ def _pobtas_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) h2d_diagonal_events[1].record(stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) - h2d_lower_diagonal_events[1].record(stream=h2d_stream) + if n_diag_blocks > 2: + L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) + h2d_lower_diagonal_events[1].record(stream=h2d_stream) if trans == "N": From 22c829ac7b3b29fff0909b81cffd06c674b6084e Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 14:15:02 +0000 Subject: [PATCH 045/242] removed debug statements --- src/serinv/algs/pobtas.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 5d940ff9..53a4f14a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -466,9 +466,6 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_tip_events[0]) compute_stream.wait_event(compute_partial_events[0]) - print(L_lower_arrow_blocks_d) - print(B_arrow_tip_d) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[0]) compute_partial_events[1].record(stream=compute_stream) From 79d78f05c653bc00575733a795ad8c13fe7f8ecd Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 14:19:06 +0000 Subject: [PATCH 046/242] insert debug statement --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index cbb7aaed..af1daffc 100644 
--- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - array_type = "streaming" + # array_type = "streaming" A = dd_bta( diagonal_blocksize, @@ -57,6 +57,7 @@ def test_pobtas( ) = bta_dense_to_arrays(A, diagonal_blocksize, arrowhead_blocksize, n_diag_blocks) if backend_flags["cupy_avail"] and array_type == "streaming": + print("streaming") A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks) A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks[:, :, :] A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks) From f00a04a4c5ac9086b3bdaebe36e9424ee38bb7d5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 14:28:46 +0000 Subject: [PATCH 047/242] forced streaming in tests again --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index af1daffc..cbb7aaed 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -22,7 +22,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - # array_type = "streaming" + array_type = "streaming" A = dd_bta( diagonal_blocksize, @@ -57,7 +57,6 @@ def test_pobtas( ) = bta_dense_to_arrays(A, diagonal_blocksize, arrowhead_blocksize, n_diag_blocks) if backend_flags["cupy_avail"] and array_type == "streaming": - print("streaming") A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks) A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks[:, :, :] A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks) From cccd82d45bb6ae00ef866fc49252b9494db56712 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 15:23:36 +0000 Subject: [PATCH 048/242] force streaming in pobtaf for testing --- 
tests/tests_algs/regular/tests_bta/test_pobtaf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtaf.py b/tests/tests_algs/regular/tests_bta/test_pobtaf.py index a30b9094..ab2e306c 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtaf.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtaf.py @@ -20,6 +20,8 @@ def test_pobtaf( array_type: str, dtype: np.dtype, ): + array_type = "streaming" + A = dd_bta( diagonal_blocksize, arrowhead_blocksize, From 416b0aabece8bf3447eaa17545689b0d1ca55ae5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 25 Apr 2025 15:26:05 +0000 Subject: [PATCH 049/242] removed forced streaming from pobtaf --- tests/tests_algs/regular/tests_bta/test_pobtaf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtaf.py b/tests/tests_algs/regular/tests_bta/test_pobtaf.py index ab2e306c..a30b9094 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtaf.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtaf.py @@ -20,8 +20,6 @@ def test_pobtaf( array_type: str, dtype: np.dtype, ): - array_type = "streaming" - A = dd_bta( diagonal_blocksize, arrowhead_blocksize, From e943999f957ebbf89eb2bcf45b983999b097f749 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 07:31:17 +0000 Subject: [PATCH 050/242] changed stream timing --- src/serinv/algs/pobtas.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 53a4f14a..478b3765 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -336,8 +336,6 @@ def _pobtas_streaming( with compute_stream: # Compute step 1 : compute B compute_stream.wait_event(h2d_diagonal_events[i % 2]) - compute_stream.wait_event(compute_arrow_B_events[i % 2]) - compute_stream.wait_event(compute_current_B_events[(i + 1) % 2]) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2], @@ -364,10 +362,7 @@ def _pobtas_streaming( 
with compute_stream: # 2 - compute_stream.wait_event(h2d_lower_diagonal_events[i % 2]) compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) - compute_stream.wait_event(compute_current_B_events[i % 2]) - compute_stream.wait_event(compute_next_B_events[(i + 1) % 2]) B_d[(i + 1) % 2] -= ( L_lower_diagonal_blocks_d[i % 2] @ B_d[i % 2] @@ -396,8 +391,6 @@ def _pobtas_streaming( with compute_stream: # 3 compute_stream.wait_event(h2d_arrow_events[i % 2]) - compute_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - compute_stream.wait_event(compute_next_B_events[i % 2]) if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): compute_stream.wait_event(h2d_tip_events[i % 2]) @@ -448,7 +441,6 @@ def _pobtas_streaming( with compute_stream: - compute_stream.wait_event(h2d_diagonal_events[0]) compute_stream.wait_event(h2d_B_events[0]) B_d[0] = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) compute_partial_events[0].record(stream=compute_stream) @@ -462,9 +454,7 @@ def _pobtas_streaming( h2d_tip_events[0].record(stream=h2d_stream) with compute_stream: - compute_stream.wait_event(h2d_arrow_events[0]) compute_stream.wait_event(h2d_tip_events[0]) - compute_stream.wait_event(compute_partial_events[0]) B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[0]) compute_partial_events[1].record(stream=compute_stream) From 8a05718b5f38085deb21c8c08ccaf9857f170639 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 07:32:54 +0000 Subject: [PATCH 051/242] added sync --- src/serinv/algs/pobtas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 478b3765..bba93898 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -480,4 +480,7 @@ def _pobtas_streaming( # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} else: - raise ValueError(f"Invalid transpose argument: {trans}.") \ No newline at end of file 
+ raise ValueError(f"Invalid transpose argument: {trans}.") + + + cp.cuda.Device().synchronize() \ No newline at end of file From cef552cad68e416e73c101e0b9fce7c1035c858d Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 11:31:55 +0000 Subject: [PATCH 052/242] insert debug statements --- src/serinv/algs/pobtas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index bba93898..46f201c1 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -331,6 +331,10 @@ def _pobtas_streaming( if trans == "N": + + print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) + print(B[-arrow_blocksize:]) + for i in range(0, n_diag_blocks - 1): # --- Forward substitution --- with compute_stream: From 18a8b8f0015d2e8fe48d0fd02b3b80c1cf8423a8 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 11:33:41 +0000 Subject: [PATCH 053/242] insert antoher debug statement --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 46f201c1..40142a77 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -332,6 +332,7 @@ def _pobtas_streaming( if trans == "N": + print(B) print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize]) print(B[-arrow_blocksize:]) From 8a1f9f3fbdaf79b4b64aa95c93b69f13fdff0fef Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 11:40:14 +0000 Subject: [PATCH 054/242] removed misguided overlap protection --- src/serinv/algs/pobtas.py | 45 ++++++--------------------------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 40142a77..ab9df1ea 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -331,11 +331,6 @@ def _pobtas_streaming( if trans == "N": - - print(B) - print(B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * 
diag_blocksize]) - print(B[-arrow_blocksize:]) - for i in range(0, n_diag_blocks - 1): # --- Forward substitution --- with compute_stream: @@ -352,8 +347,6 @@ def _pobtas_streaming( h2d_stream.wait_event(compute_current_B_events[i % 2]) L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) h2d_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) - if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): - B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream,) d2h_stream.wait_event(compute_current_B_events[i % 2]) B_d[i % 2].get( @@ -378,26 +371,10 @@ def _pobtas_streaming( h2d_stream.wait_event(compute_next_B_events[i % 2]) L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) h2d_lower_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) - - if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): - d2h_stream.wait_event(compute_next_B_events[i % 2]) - B_d[(i + 1) % 2].get( - out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], - stream=d2h_stream, - blocking=False, - ) - d2h_B_events[(i + 1) % 2].record(stream=d2h_stream) - - h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream,) - h2d_tip_events[i % 2].record(stream=h2d_stream) - with compute_stream: # 3 compute_stream.wait_event(h2d_arrow_events[i % 2]) - if not ((i + 2) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): - compute_stream.wait_event(h2d_tip_events[i % 2]) B_arrow_tip_d -= ( L_lower_arrow_blocks_d[i % 2] @@ -405,22 +382,14 @@ def _pobtas_streaming( ) compute_arrow_B_events[i % 2].record(stream=compute_stream) - - - # make sure that arrowtip and B overlap gets resolved - if ((i + 3) * diag_blocksize) < (n_diag_blocks * diag_blocksize - arrow_blocksize): - h2d_stream.wait_event(compute_arrow_B_events[i % 2]) - B_d[(i + 2) % 
2].set(arr=B[(i + 2) * diag_blocksize : (i + 3) * diag_blocksize], stream = h2d_stream) - h2d_B_events[(i + 2) % 2].record(stream=h2d_stream) - else: - d2h_stream.wait_event(compute_arrow_B_events[i % 2]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_tip_events[i % 2].record(stream=d2h_stream) - - if i + 1 < n_diag_blocks - 1: - B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream) - h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + d2h_tip_events[i % 2].record(stream=d2h_stream) + + if i + 1 < n_diag_blocks - 1: + B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream) + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) if i + 2 < n_diag_blocks - 1: From 5acc905905965d1619ee4816b52a1c1034d84e53 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:31:38 +0000 Subject: [PATCH 055/242] changed streaming order --- src/serinv/algs/pobtas.py | 96 +++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index ab9df1ea..a2f6799a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -304,76 +304,104 @@ def _pobtas_streaming( L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) - #L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - #h2d_lower_diagonal_events[0].record(stream=h2d_stream) - L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[0], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) # --- D2H: event --- d2h_B_events[1].record(stream=d2h_stream) + n_diag_blocks: int = L_diagonal_blocks.shape[0] - n_diag_blocks: int = L_diagonal_blocks.shape[0] 
# why? - if n_diag_blocks > 1: - B_d[1].set(arr=B[1 * diag_blocksize : 2 * diag_blocksize], stream = h2d_stream) - h2d_B_events[1].record(stream=h2d_stream) - - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - h2d_lower_diagonal_events[0].record(stream=h2d_stream) - - L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - h2d_diagonal_events[1].record(stream=h2d_stream) + # if n_diag_blocks > 1: - if n_diag_blocks > 2: - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) - h2d_lower_diagonal_events[1].record(stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + h2d_lower_diagonal_events[0].record(stream=h2d_stream) if trans == "N": for i in range(0, n_diag_blocks - 1): # --- Forward substitution --- + + if i + 1 < n_diag_blocks - 1: + # stream next B block + h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) + + B_d[(i + 1) % 2].set( + arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream = h2d_stream + ) + + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + + # stream next diagonal block + h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) + + L_diagonal_blocks_d[(i + 1) % 2].set( + arr=L_diagonal_blocks[i + 1], + stream=h2d_stream + ) + + h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) + + with compute_stream: # Compute step 1 : compute B compute_stream.wait_event(h2d_diagonal_events[i % 2]) + B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2], lower=True, ) + compute_current_B_events[i % 2].record(stream=compute_stream) - if i + 2 < n_diag_blocks - 1: - h2d_stream.wait_event(compute_current_B_events[i % 2]) - L_diagonal_blocks_d[(i + 2) % 2].set(arr=L_diagonal_blocks[i + 2], stream=h2d_stream) - h2d_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) - + # stream B back d2h_stream.wait_event(compute_current_B_events[i % 2]) + B_d[i % 2].get( out=B[i * 
diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False, ) + d2h_B_events[i % 2].record(stream=d2h_stream) - + if i + 1 < n_diag_blocks - 1: + # stream next lower diagonal block + h2d_stream.wait_event(compute_next_B_events[(i + 1) % 2]) + + L_lower_diagonal_blocks_d[(i + 1) % 2].set( + arr=L_lower_diagonal_blocks[i + 1], + stream=h2d_stream + ) + + h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: - # 2 + # Compute step 2 : update next B compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2] -= ( L_lower_diagonal_blocks_d[i % 2] @ B_d[i % 2] ) - compute_next_B_events[i % 2].record(stream=compute_stream) - if i + 2 < n_diag_blocks - 1: - h2d_stream.wait_event(compute_next_B_events[i % 2]) - L_lower_diagonal_blocks_d[(i + 2) % 2].set(arr=L_lower_diagonal_blocks[i + 2], stream=h2d_stream) - h2d_lower_diagonal_events[(i + 2) % 2].record(stream=h2d_stream) + compute_next_B_events[i % 2].record(stream=compute_stream) + if i + 1 < n_diag_blocks - 1: + # stream next lower arrow block + h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) + + L_lower_arrow_blocks_d[(i + 1) % 2].set( + arr=L_lower_arrow_blocks[i + 1], + stream=h2d_stream + ) + + h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) + with compute_stream: - # 3 + # Compute step 3 : update arrowtip compute_stream.wait_event(h2d_arrow_events[i % 2]) B_arrow_tip_d -= ( @@ -386,16 +414,6 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_arrow_B_events[i % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) d2h_tip_events[i % 2].record(stream=d2h_stream) - - if i + 1 < n_diag_blocks - 1: - B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream) - h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) - - - if i + 2 < n_diag_blocks - 1: - - L_lower_arrow_blocks_d[(i + 1) % 2].set(arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream) - h2d_arrow_events[(i 
+ 1) % 2].record(stream=h2d_stream) if not partial: From c45adc911ce8b5bd94c93279074d26cf5534497b Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:32:25 +0000 Subject: [PATCH 056/242] rolled back if statement --- src/serinv/algs/pobtas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a2f6799a..7d59cb0f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -312,10 +312,10 @@ def _pobtas_streaming( n_diag_blocks: int = L_diagonal_blocks.shape[0] - # if n_diag_blocks > 1: + if n_diag_blocks > 1: - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - h2d_lower_diagonal_events[0].record(stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + h2d_lower_diagonal_events[0].record(stream=h2d_stream) if trans == "N": From 37518945528c512ff98222231fea9ffdcdda7862 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:37:38 +0000 Subject: [PATCH 057/242] debug statement to check if the last block is the problem --- src/serinv/algs/pobtas.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 7d59cb0f..0384545d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -319,8 +319,8 @@ def _pobtas_streaming( if trans == "N": + # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): - # --- Forward substitution --- if i + 1 < n_diag_blocks - 1: # stream next B block @@ -419,6 +419,11 @@ def _pobtas_streaming( if not partial: # In the case of the partial solve, we do not solve the last block and # arrow tip block of the RHS. + + raise NotImplementedError( + "wrong." 
+ ) + h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) @@ -464,8 +469,8 @@ def _pobtas_streaming( if not partial: # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) raise NotImplementedError( - "T and C not yet implemented." - ) + "T and C not yet implemented." + ) # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) # for i in range(n_diag_blocks -2, -1, -1): From c6f63d1c9b261bb1c79d39849d211eafe6966f00 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:48:24 +0000 Subject: [PATCH 058/242] changed non partial solve --- src/serinv/algs/pobtas.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0384545d..dde50341 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -419,41 +419,29 @@ def _pobtas_streaming( if not partial: # In the case of the partial solve, we do not solve the last block and # arrow tip block of the RHS. - - raise NotImplementedError( - "wrong." 
- ) - h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) - B_d[0].set(arr=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=h2d_stream,) - h2d_B_events[0].record(stream=h2d_stream) - L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) with compute_stream: - compute_stream.wait_event(h2d_B_events[0]) - B_d[0] = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[0], lower=True,)) + compute_stream.wait_event(h2d_diagonal_events[0]) + B_d[(n_diag_blocks - 1) % 2] = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[(n_diag_blocks - 1) % 2], lower=True,)) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) B_d[0].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) d2h_B_events[0].record(stream=d2h_stream) - h2d_stream.wait_event(d2h_B_events[0]) - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=d2h_stream,) - h2d_tip_events[0].record(stream=h2d_stream) - with compute_stream: - compute_stream.wait_event(h2d_tip_events[0]) + compute_stream.wait_event(h2d_arrow_events[0]) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[0]) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[(n_diag_blocks - 1) % 2]) compute_partial_events[1].record(stream=compute_stream) compute_stream.wait_event(compute_partial_events[1]) From ee1798c16e7f1bd73e74c9089389e59a8c4d692b Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:52:12 +0000 Subject: [PATCH 059/242] debug to see passed tests --- src/serinv/algs/pobtas.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index dde50341..7865616b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py 
@@ -417,6 +417,9 @@ def _pobtas_streaming( if not partial: + raise NotImplementedError( + "just error display" + ) # In the case of the partial solve, we do not solve the last block and # arrow tip block of the RHS. From c63a5de2dc4b9f0d463dad62a67d976e87fe025c Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 12:57:35 +0000 Subject: [PATCH 060/242] inserted debug statements to compare B --- src/serinv/algs/pobtas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 7865616b..e89c9273 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -319,6 +319,7 @@ def _pobtas_streaming( if trans == "N": + print(B) # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): @@ -401,7 +402,7 @@ def _pobtas_streaming( h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: - # Compute step 3 : update arrowtip + # Compute step 3 : update arrow tip compute_stream.wait_event(h2d_arrow_events[i % 2]) B_arrow_tip_d -= ( @@ -417,9 +418,6 @@ def _pobtas_streaming( if not partial: - raise NotImplementedError( - "just error display" - ) # In the case of the partial solve, we do not solve the last block and # arrow tip block of the RHS. 
@@ -454,6 +452,8 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_partial_events[0]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + print(B) + elif trans == "T" or trans == "C": # ----- Backward substitution ----- From 2083e06c77ec523c0217344af1fe6d798122269b Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 13:02:06 +0000 Subject: [PATCH 061/242] changed arrow tip block --- src/serinv/algs/pobtas.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e89c9273..1543d657 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -295,7 +295,7 @@ def _pobtas_streaming( compute_arrow_B_events[1].record(stream=compute_stream) B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) - L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:, :], stream=h2d_stream) + L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) # --- H2D: transfers --- B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) @@ -319,7 +319,6 @@ def _pobtas_streaming( if trans == "N": - print(B) # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): @@ -452,9 +451,6 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_partial_events[0]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - print(B) - - elif trans == "T" or trans == "C": # ----- Backward substitution ----- if not partial: From 7805321b36b6c50e73ada04ae28e65c0ed65842e Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 13:23:38 +0000 Subject: [PATCH 062/242] changed stream timing --- src/serinv/algs/pobtas.py | 134 +++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1543d657..96be4853 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -241,90 +241,90 @@ def _pobtas_streaming( cp, 
cu_la = _get_module_from_str(module_str="cupy") - # Streams and events - compute_stream = cp.cuda.Stream(non_blocking=True) - h2d_stream = cp.cuda.Stream(non_blocking=True) - d2h_stream = cp.cuda.Stream(non_blocking=True) + if trans == "N": - h2d_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] - h2d_lower_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] - h2d_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] - h2d_B_events = [cp.cuda.Event(), cp.cuda.Event()] - h2d_tip_events = [cp.cuda.Event(), cp.cuda.Event()] + # Streams and events + compute_stream = cp.cuda.Stream(non_blocking=True) + h2d_stream = cp.cuda.Stream(non_blocking=True) + d2h_stream = cp.cuda.Stream(non_blocking=True) - d2h_B_events = [cp.cuda.Event(), cp.cuda.Event()] - d2h_tip_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_lower_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_B_events = [cp.cuda.Event(), cp.cuda.Event()] - compute_current_B_events = [cp.cuda.Event(), cp.cuda.Event()] - compute_next_B_events = [cp.cuda.Event(), cp.cuda.Event()] - compute_arrow_B_events = [cp.cuda.Event(), cp.cuda.Event()] + d2h_B_events = [cp.cuda.Event(), cp.cuda.Event()] + d2h_tip_events = [cp.cuda.Event(), cp.cuda.Event()] - compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] + compute_current_B_events = [cp.cuda.Event(), cp.cuda.Event()] + compute_next_B_events = [cp.cuda.Event(), cp.cuda.Event()] + compute_arrow_B_events = [cp.cuda.Event(), cp.cuda.Event()] - # Vars - diag_blocksize = L_diagonal_blocks.shape[1] - arrow_blocksize = L_lower_arrow_blocks.shape[1] - n_diag_blocks = L_diagonal_blocks.shape[0] + compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] - # Device Buffers - # B Buffers - B_shape = B[0 : diag_blocksize] # block template - B_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) - B_shape = B[-arrow_blocksize:] - B_arrow_tip_d = 
cp.empty_like(B_shape) - del B_shape - - # L Buffers - L_diagonal_blocks_d = cp.empty( - (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_lower_diagonal_blocks_d = cp.empty( - (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_lower_arrow_blocks_d = cp.empty( - (2, *L_lower_arrow_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) - - # Forward Pass - # --- C: events + transfers --- - compute_current_B_events[1].record(stream=compute_stream) - compute_next_B_events[1].record(stream=compute_stream) - compute_arrow_B_events[1].record(stream=compute_stream) - - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) - L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) - - # --- H2D: transfers --- - B_d[0].set(arr=B[0 : 1 * diag_blocksize], stream = h2d_stream) - h2d_B_events[0].record(stream=h2d_stream) - - L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) - h2d_diagonal_events[0].record(stream=h2d_stream) + # Vars + diag_blocksize = L_diagonal_blocks.shape[1] + arrow_blocksize = L_lower_arrow_blocks.shape[1] + n_diag_blocks = L_diagonal_blocks.shape[0] - L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[0], stream=h2d_stream) - h2d_arrow_events[0].record(stream=h2d_stream) + # Device Buffers + # B Buffers + B_shape = B[0 : diag_blocksize] # block template + B_d = cp.empty( + (2, *B_shape.shape), dtype=B_shape.dtype + ) + B_shape = B[-arrow_blocksize:] + B_arrow_tip_d = cp.empty_like(B_shape) + del B_shape - # --- D2H: event --- - d2h_B_events[1].record(stream=d2h_stream) - - n_diag_blocks: int = L_diagonal_blocks.shape[0] + # L Buffers + L_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_arrow_blocks_d = cp.empty( + (2, 
*L_lower_arrow_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) - if n_diag_blocks > 1: + # Forward Pass + # --- C: events + transfers --- + compute_current_B_events[1].record(stream=compute_stream) + compute_next_B_events[1].record(stream=compute_stream) + compute_arrow_B_events[1].record(stream=compute_stream) - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - h2d_lower_diagonal_events[0].record(stream=h2d_stream) + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) + L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) + # --- H2D: transfers --- + B_d[0].set(arr=B[0 : diag_blocksize], stream = h2d_stream) + h2d_B_events[0].record(stream=h2d_stream) + + L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) + h2d_diagonal_events[0].record(stream=h2d_stream) - if trans == "N": + L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[0], stream=h2d_stream) + h2d_arrow_events[0].record(stream=h2d_stream) + + # --- D2H: event --- + d2h_B_events[1].record(stream=d2h_stream) + + n_diag_blocks: int = L_diagonal_blocks.shape[0] + + if n_diag_blocks > 1: + + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + h2d_lower_diagonal_events[0].record(stream=h2d_stream) + + # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): if i + 1 < n_diag_blocks - 1: # stream next B block - h2d_stream.wait_event(d2h_B_events[(i + 1) % 2]) + h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set( arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], From 12157121a274b395324ee6290245817a4930505e Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 13:37:59 +0000 Subject: [PATCH 063/242] changed if to stream b + 1 --- src/serinv/algs/pobtas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py 
index 96be4853..50156d13 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -322,7 +322,7 @@ def _pobtas_streaming( # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): - if i + 1 < n_diag_blocks - 1: + if i < n_diag_blocks - 1: # stream next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) @@ -333,6 +333,7 @@ def _pobtas_streaming( h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + if i + 1 < n_diag_blocks - 1: # stream next diagonal block h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) From b509c0ddc784138733d16d900b1db324cd7076f8 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 13:40:16 +0000 Subject: [PATCH 064/242] debug changed to check n --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 50156d13..23549fad 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -322,7 +322,7 @@ def _pobtas_streaming( # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): - if i < n_diag_blocks - 1: + if i + 1 < n_diag_blocks - 1: # stream next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) From c0d0c330ed17cfe667c2c7e2dac436c7911cdfcc Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 13:40:58 +0000 Subject: [PATCH 065/242] consitentcy update --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 23549fad..6dd1bfdd 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -322,7 +322,7 @@ def _pobtas_streaming( # --- Forward substitution --- for i in range(0, n_diag_blocks - 1): - if i + 1 < n_diag_blocks - 1: + if i + 1 < n_diag_blocks: # stream next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) From 6c45c15b198f07182e58423a22270ee56e23f69b Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 
29 Apr 2025 13:50:04 +0000 Subject: [PATCH 066/242] changed non partial part --- src/serinv/algs/pobtas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6dd1bfdd..ccb34f20 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -422,8 +422,8 @@ def _pobtas_streaming( # arrow tip block of the RHS. h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) - L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) - h2d_diagonal_events[0].record(stream=h2d_stream) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) + h2d_diagonal_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_arrow_events[0].record(stream=h2d_stream) @@ -432,7 +432,7 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_diagonal_events[0]) - B_d[(n_diag_blocks - 1) % 2] = (cu_la.solve_triangular(L_diagonal_blocks_d[0], B_d[(n_diag_blocks - 1) % 2], lower=True,)) + B_d[(n_diag_blocks - 1) % 2] = (cu_la.solve_triangular(L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2], lower=True,)) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) From ba9f28ead0b50feb26446bf1494526220d45f196 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 14:01:17 +0000 Subject: [PATCH 067/242] changed non partial block to match indexing --- src/serinv/algs/pobtas.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index ccb34f20..33979a92 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -425,24 +425,24 @@ def _pobtas_streaming( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[n_diag_blocks - 1], 
stream=h2d_stream) h2d_diagonal_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) - L_lower_arrow_blocks_d[0].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) - h2d_arrow_events[0].record(stream=h2d_stream) + L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) + h2d_arrow_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) with compute_stream: - compute_stream.wait_event(h2d_diagonal_events[0]) + compute_stream.wait_event(h2d_diagonal_events[(n_diag_blocks - 1) % 2]) B_d[(n_diag_blocks - 1) % 2] = (cu_la.solve_triangular(L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2], lower=True,)) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) - B_d[0].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) + B_d[(n_diag_blocks - 1) % 2].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) d2h_B_events[0].record(stream=d2h_stream) with compute_stream: - compute_stream.wait_event(h2d_arrow_events[0]) + compute_stream.wait_event(h2d_arrow_events[(n_diag_blocks - 1) % 2]) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[0] @ B_d[(n_diag_blocks - 1) % 2]) + B_arrow_tip_d -= (L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2] @ B_d[(n_diag_blocks - 1) % 2]) compute_partial_events[1].record(stream=compute_stream) compute_stream.wait_event(compute_partial_events[1]) From 90d6a747dd4c6cd285c7f6aeb17945333181f1cc Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 20:40:34 +0000 Subject: [PATCH 068/242] first attempt at backward solve --- src/serinv/algs/pobtas.py | 157 +++++++++++++----- .../regular/tests_bta/test_pobtas.py | 1 + 2 files changed, 121 insertions(+), 37 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 33979a92..6da26720 100644 --- a/src/serinv/algs/pobtas.py +++ 
b/src/serinv/algs/pobtas.py @@ -241,13 +241,47 @@ def _pobtas_streaming( cp, cu_la = _get_module_from_str(module_str="cupy") + # Vars + diag_blocksize = L_diagonal_blocks.shape[1] + arrow_blocksize = L_lower_arrow_blocks.shape[1] + n_diag_blocks = L_diagonal_blocks.shape[0] + + # Streams + compute_stream = cp.cuda.Stream(non_blocking=True) + h2d_stream = cp.cuda.Stream(non_blocking=True) + d2h_stream = cp.cuda.Stream(non_blocking=True) + + + + # Device Buffers + # B Buffers + B_shape = B[-arrow_blocksize:] # block template + B_arrow_tip_d = cp.empty_like(B_shape) + + B_shape = B[0 : diag_blocksize] + B_d = cp.empty( + (2, *B_shape.shape), dtype=B_shape.dtype + ) + + + # L Buffers + L_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_arrow_blocks_d = cp.empty( + (2, *L_lower_arrow_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) + if trans == "N": - # Streams and events - compute_stream = cp.cuda.Stream(non_blocking=True) - h2d_stream = cp.cuda.Stream(non_blocking=True) - d2h_stream = cp.cuda.Stream(non_blocking=True) + # delete helper variable + del B_shape + # Events h2d_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_lower_diagonal_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_arrow_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -262,33 +296,6 @@ def _pobtas_streaming( compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] - # Vars - diag_blocksize = L_diagonal_blocks.shape[1] - arrow_blocksize = L_lower_arrow_blocks.shape[1] - n_diag_blocks = L_diagonal_blocks.shape[0] - - # Device Buffers - # B Buffers - B_shape = B[0 : diag_blocksize] # block template - B_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) - B_shape = B[-arrow_blocksize:] - B_arrow_tip_d = cp.empty_like(B_shape) - del B_shape - - # L 
Buffers - L_diagonal_blocks_d = cp.empty( - (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_lower_diagonal_blocks_d = cp.empty( - (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_lower_arrow_blocks_d = cp.empty( - (2, *L_lower_arrow_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype - ) - L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) - # Forward Pass # --- C: events + transfers --- compute_current_B_events[1].record(stream=compute_stream) @@ -453,16 +460,92 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) elif trans == "T" or trans == "C": + # Buffers + B_previous_d = cp.empty_like(B_shape) + del B_shape + + # Events + compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_events = [cp.cuda.Event(), cp.cuda.Event()] + d2h_events = [cp.cuda.Event(), cp.cuda.Event()] + + # Forward Pass + # --- C: events + transfers --- + + B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) + L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) + B_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) + L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) + + h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + if n_diag_blocks > 1: + B_d[n_diag_blocks % 2].set( + arr=B[-arrow_blocksize - 2 * diag_blocksize : -arrow_blocksize - diag_blocksize], + stream=h2d_stream + ) + # ----- Backward substitution ----- if not partial: # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) - raise NotImplementedError( - "T and C not yet implemented." 
- ) - # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) + with compute_stream: + compute_stream.wait_event(h2d_events[n_diag_blocks % 2]) + B_arrow_tip_d = cu_la.solve_triangular( + L_arrow_tip_block_d, + B_arrow_tip_d, + lower=True, + trans="C", + ) - # for i in range(n_diag_blocks -2, -1, -1): - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + B_d[(n_diag_blocks -1) % 2] = ( + cu_la.solve_triangular( + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2] + - L_lower_arrow_blocks[(n_diag_blocks - 1) % 2].conj().T @ B_arrow_tip_d, + lower=True, + trans="C", + ) + ) + + compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) + + d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_d.get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) + d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + + for i in range(n_diag_blocks - 2, -1, -1): + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + + if i > 0: + with h2d_stream: + h2d_stream.wait_event(d2h_events[(i + 1) % 2]) + + B_previous_d = B_d[(i + 1) % 2] + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize]) + L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1]) + L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1]) + L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1]) + + h2d_events[(i + 1) % 2].record(stream=h2d_stream) + + with compute_stream: + compute_stream.wait_event(h2d_events[i % 2]) + + B_d[i % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[i % 2], + B_d[i % 2] + - L_lower_diagonal_blocks_d[i % 2].conj().T + @ B_previous_d + - L_lower_arrow_blocks_d[i % 2].conj().T @ B_arrow_tip_d, + lower=True, + trans="C", + ) + + compute_B_events[i % 
2].record(compute_stream) + + B_d[i % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_events[i % 2].record(stream=d2h_stream) else: raise ValueError(f"Invalid transpose argument: {trans}.") diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index cbb7aaed..4c51e79c 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -100,6 +100,7 @@ def test_pobtas( A_arrow_tip_block, B, trans="C", + device_streaming=True if array_type == "streaming" else False, ) assert xp.allclose(B, X_ref) From 14a9215f400a83a1d69642c1c0fd901bbf908ed3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 20:48:25 +0000 Subject: [PATCH 069/242] fixed typo --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6da26720..9ce95372 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -501,7 +501,7 @@ def _pobtas_streaming( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2] - - L_lower_arrow_blocks[(n_diag_blocks - 1) % 2].conj().T @ B_arrow_tip_d, + - L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].conj().T @ B_arrow_tip_d, lower=True, trans="C", ) From 3ce3b0aef4cebe1d6c9368ee03bbf79e009a5855 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 20:50:58 +0000 Subject: [PATCH 070/242] another typo --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 9ce95372..fd811ef5 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -497,7 +497,7 @@ def _pobtas_streaming( trans="C", ) - B_d[(n_diag_blocks -1) % 2] = ( + B_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 
2], B_d[(n_diag_blocks - 1) % 2] @@ -511,7 +511,7 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - B_d.get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) + B_d[(n_diag_blocks - 1) % 2].get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) for i in range(n_diag_blocks - 2, -1, -1): From 99c2e3d784bde472f1cf8648f7c59c90bac0e729 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:03:42 +0000 Subject: [PATCH 071/242] insert parenthesis --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index fd811ef5..94dfccbb 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -481,7 +481,7 @@ def _pobtas_streaming( h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) if n_diag_blocks > 1: B_d[n_diag_blocks % 2].set( - arr=B[-arrow_blocksize - 2 * diag_blocksize : -arrow_blocksize - diag_blocksize], + arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) From af5f83d8310f69c888c6af5016ca8ccc1ee6fec7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:13:32 +0000 Subject: [PATCH 072/242] insert debug staetments --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 94dfccbb..3b8e38e3 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -520,14 +520,15 @@ def _pobtas_streaming( if i > 0: with h2d_stream: h2d_stream.wait_event(d2h_events[(i + 1) % 2]) - + print(B_d[(i + 1) % 2]) B_previous_d = B_d[(i + 1) % 2] + print(B_previous_d) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * 
diag_blocksize]) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1]) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1]) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1]) - h2d_events[(i + 1) % 2].record(stream=h2d_stream) + h2d_events[(i - 1) % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) From 799af09d084964bbae435960dd727346b771b043 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:15:27 +0000 Subject: [PATCH 073/242] more debug --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 3b8e38e3..af83d80c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -516,7 +516,7 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - + print(B) if i > 0: with h2d_stream: h2d_stream.wait_event(d2h_events[(i + 1) % 2]) From 5cc569eac5e1937181dfde3a3d5df8dc9a803873 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:27:42 +0000 Subject: [PATCH 074/242] added missing streaming --- src/serinv/algs/pobtas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index af83d80c..78a2c541 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -484,6 +484,8 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) + L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) + L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) # ----- Backward substitution ----- if not partial: @@ -516,13 +518,10 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - 
L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - print(B) if i > 0: with h2d_stream: h2d_stream.wait_event(d2h_events[(i + 1) % 2]) - print(B_d[(i + 1) % 2]) B_previous_d = B_d[(i + 1) % 2] - print(B_previous_d) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize]) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1]) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1]) From fa95f1614daf34600dfa8ccf88945abbc1ec97eb Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:31:46 +0000 Subject: [PATCH 075/242] added debug statements --- src/serinv/algs/pobtas.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 78a2c541..e58854e6 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -484,8 +484,11 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) + print(B) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) + print(L_diagonal_blocks_d[n_diag_blocks % 2]) + print(L_lower_arrow_blocks_d[n_diag_blocks % 2]) # ----- Backward substitution ----- if not partial: From b394b91a4101c52185bed3b9af698541f13f45cb Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:32:38 +0000 Subject: [PATCH 076/242] changed debug --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e58854e6..bd583559 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -484,7 +484,7 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) - print(B) + print(L_diagonal_blocks) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], 
stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) print(L_diagonal_blocks_d[n_diag_blocks % 2]) From 28220ad9571abe9214252f745310a8d2ca93c527 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 21:51:35 +0000 Subject: [PATCH 077/242] new debug statements --- src/serinv/algs/pobtas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index bd583559..e79cf865 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -484,11 +484,10 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) - print(L_diagonal_blocks) + print(B) + print(B_d[n_diag_blocks % 2]) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) - print(L_diagonal_blocks_d[n_diag_blocks % 2]) - print(L_lower_arrow_blocks_d[n_diag_blocks % 2]) # ----- Backward substitution ----- if not partial: From 7e19033b53ac506461a7b6974f3b4b5759769d1c Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:00:52 +0000 Subject: [PATCH 078/242] new debugs --- src/serinv/algs/pobtas.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e79cf865..f4ae40a6 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -484,8 +484,6 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) - print(B) - print(B_d[n_diag_blocks % 2]) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) @@ -533,7 +531,7 @@ def _pobtas_streaming( with compute_stream: 
compute_stream.wait_event(h2d_events[i % 2]) - + print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 813638169f6cd7d42bdd31c7b0a349ca5a1ad8f9 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:04:18 +0000 Subject: [PATCH 079/242] changed stream timing --- src/serinv/algs/pobtas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f4ae40a6..52da1e2a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -518,10 +518,13 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + with compute_stream: + B_previous_d = B_d[(i + 1) % 2] + if i > 0: with h2d_stream: h2d_stream.wait_event(d2h_events[(i + 1) % 2]) - B_previous_d = B_d[(i + 1) % 2] + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize]) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1]) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1]) From cfa8307aa92e2d7dc5475a3cd461c4e9f91fe82f Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:08:24 +0000 Subject: [PATCH 080/242] adjusted stram timing --- src/serinv/algs/pobtas.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 52da1e2a..65aa5e8f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -466,6 +466,7 @@ def _pobtas_streaming( # Events compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] + previous_B_event = cp.cuda.Event() h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -519,18 +520,20 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} with compute_stream: + 
compute_stream.wait_event(d2h_events[(i + 1) % 2]) B_previous_d = B_d[(i + 1) % 2] + previous_B_event.record(stream=compute_stream) if i > 0: - with h2d_stream: - h2d_stream.wait_event(d2h_events[(i + 1) % 2]) + + h2d_stream.wait_event(previous_B_event) - B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize]) - L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1]) - L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1]) - L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1]) + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) + L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) + L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) - h2d_events[(i - 1) % 2].record(stream=h2d_stream) + h2d_events[(i - 1) % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) From 304b3687c9d7d51498be40e517ac696b1c1f7997 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:26:50 +0000 Subject: [PATCH 081/242] changed event recording --- src/serinv/algs/pobtas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 65aa5e8f..f1c5e39d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -479,7 +479,7 @@ def _pobtas_streaming( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) - h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) if n_diag_blocks > 1: B_d[n_diag_blocks % 2].set( arr=B[-arrow_blocksize - (2 * diag_blocksize) : 
-arrow_blocksize - diag_blocksize], @@ -488,11 +488,15 @@ def _pobtas_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) + h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + + + # ----- Backward substitution ----- if not partial: # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) with compute_stream: - compute_stream.wait_event(h2d_events[n_diag_blocks % 2]) + compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d = cu_la.solve_triangular( L_arrow_tip_block_d, B_arrow_tip_d, From dd82d4bfc537040f3a72c9a4a5dd4258c5d64f53 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:31:30 +0000 Subject: [PATCH 082/242] more debug --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f1c5e39d..3845b3c2 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -531,7 +531,7 @@ def _pobtas_streaming( if i > 0: h2d_stream.wait_event(previous_B_event) - + print("ping") B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -541,7 +541,7 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - print(B_previous_d) + print("pong") B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 5370943e1cd4e2ae0e4c69c523da181b78ef90e7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:39:54 +0000 Subject: [PATCH 083/242] insert first compare debug --- src/serinv/algs/pobtas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py 
b/src/serinv/algs/pobtas.py index 3845b3c2..0443cb1a 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -531,7 +531,7 @@ def _pobtas_streaming( if i > 0: h2d_stream.wait_event(previous_B_event) - print("ping") + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -541,7 +541,9 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - print("pong") + print(i) + print(L_diagonal_blocks) + print(L_diagonal_blocks_d[i % 2]) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 38289105b743b5dd72e51e687845aa03c2145405 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:42:40 +0000 Subject: [PATCH 084/242] second debug compare --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0443cb1a..3a4aa4f0 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -542,8 +542,8 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) print(i) - print(L_diagonal_blocks) - print(L_diagonal_blocks_d[i % 2]) + print(L_lower_diagonal_blocks) + print(L_lower_diagonal_blocks_d[i % 2]) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From e36f83d118b8632e7f30034f9d60b59f04b34371 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:52:27 +0000 Subject: [PATCH 085/242] inserted lower diagonal blocks streaming --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 3a4aa4f0..8af88598 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -487,6 +487,7 @@ def 
_pobtas_streaming( ) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) From f71a31511881fe086d9303e318642acb239b1f4a Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:54:55 +0000 Subject: [PATCH 086/242] debug compare 3 --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 8af88598..3a18661f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -543,8 +543,8 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) print(i) - print(L_lower_diagonal_blocks) - print(L_lower_diagonal_blocks_d[i % 2]) + print(L_lower_arrow_blocks) + print(L_lower_arrow_blocks_d[i % 2]) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From a2ddd30aed7d2bcf2a805f5cbecb8edaa54d5323 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 22:59:00 +0000 Subject: [PATCH 087/242] compare 4 --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 3a18661f..4055dc58 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -543,8 +543,8 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) print(i) - print(L_lower_arrow_blocks) - print(L_lower_arrow_blocks_d[i % 2]) + print(B) + print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 5d22f9476dba71913b661c886e5306a60b9f49c9 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:06:56 +0000 Subject: [PATCH 088/242] changed location of B_previous 
--- src/serinv/algs/pobtas.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 4055dc58..3ab6031e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -524,19 +524,16 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - with compute_stream: - compute_stream.wait_event(d2h_events[(i + 1) % 2]) - B_previous_d = B_d[(i + 1) % 2] - previous_B_event.record(stream=compute_stream) if i > 0: - h2d_stream.wait_event(previous_B_event) + h2d_stream.wait_event(d2h_events[(i + 1) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) h2d_events[(i - 1) % 2].record(stream=h2d_stream) From e59fd543233342c69a791b9fa59cc661259fb2f8 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:10:50 +0000 Subject: [PATCH 089/242] added previous B setup --- src/serinv/algs/pobtas.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 3ab6031e..67f52ba0 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -480,16 +480,7 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) - if n_diag_blocks > 1: - B_d[n_diag_blocks % 2].set( - arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], - 
stream=h2d_stream - ) - L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) - L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) - L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - - h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + @@ -521,6 +512,21 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) B_d[(n_diag_blocks - 1) % 2].get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + previous_B_event.record(stream=d2h_stream) + + if n_diag_blocks > 1: + + B_d[n_diag_blocks % 2].set( + arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], + stream=h2d_stream + ) + L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) + L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) + h2d_stream.wait_event(previous_B_event) + B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) + + h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} From 50e728dba3f77780dfacd656ce32f521fe673526 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:12:46 +0000 Subject: [PATCH 090/242] fixed indexing --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 67f52ba0..2ae221da 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -524,7 +524,7 @@ def _pobtas_streaming( 
L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_event) - B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) + B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) From 31233d087730b6939d55f9c8528bf68d2fc5f5a4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:17:31 +0000 Subject: [PATCH 091/242] moved brevious b from if --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 2ae221da..69b3415e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -524,9 +524,10 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_event) - B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + + B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) - h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} From 2133c4d43380c73601b7fd93b4e48c0a4e3b9a94 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:19:20 +0000 Subject: [PATCH 092/242] moved previous b from correct if --- src/serinv/algs/pobtas.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 69b3415e..e21ac8e4 
100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -524,10 +524,9 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_event) - - B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) - h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} @@ -540,9 +539,9 @@ def _pobtas_streaming( L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) - B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) + B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) - h2d_events[(i - 1) % 2].record(stream=h2d_stream) + h2d_events[(i - 1) % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) From 533af2a74560e93d586390a7c0055bb115c09e91 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:21:53 +0000 Subject: [PATCH 093/242] removed debug statements --- src/serinv/algs/pobtas.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index e21ac8e4..18420c83 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -539,15 +539,13 @@ def _pobtas_streaming( L_diagonal_blocks_d[(i - 1) % 
2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) h2d_events[(i - 1) % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - print(i) - print(B) - print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 2db72730a7c69616e05aaac022e93f500ef57c4d Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:22:50 +0000 Subject: [PATCH 094/242] moved a wait event --- src/serinv/algs/pobtas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 18420c83..5c0a8809 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -531,10 +531,9 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - if i > 0: - - h2d_stream.wait_event(d2h_events[(i + 1) % 2]) + h2d_stream.wait_event(d2h_events[(i + 1) % 2]) + if i > 0: B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) From 45b717937ff88316ed3b14723d38be5b09f17f87 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:27:24 +0000 Subject: [PATCH 095/242] delayed d2h stream --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 5c0a8809..aa57a128 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -557,6 +557,7 @@ def 
_pobtas_streaming( compute_B_events[i % 2].record(compute_stream) + d2h_stream.wait_event(compute_B_events[i % 2]) B_d[i % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From ab395e468420f89ad3bc8b36f8a00dfa796f7546 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:30:42 +0000 Subject: [PATCH 096/242] adjusted stream timing --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index aa57a128..5efe534e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -539,6 +539,7 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) h2d_events[(i - 1) % 2].record(stream=h2d_stream) From d4f0128710c5a8c53c9a5f7277f995591f7fed60 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 29 Apr 2025 23:34:22 +0000 Subject: [PATCH 097/242] even more adjusted timing --- src/serinv/algs/pobtas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 5efe534e..5d810cfd 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -538,14 +538,16 @@ def _pobtas_streaming( L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + h2d_events[(i - 1) % 2].record(stream=h2d_stream) h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d.set(arr=B[(i + 1) * diag_blocksize : 
(i + 2) * diag_blocksize], stream=h2d_stream) - - h2d_events[(i - 1) % 2].record(stream=h2d_stream) + previous_B_event.record(stream=d2h_stream) + with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) + h2d_stream.wait_event(previous_B_event) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From ba2d6acb1b4e92937d8d93aa87077a178d6b17d2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 12:09:15 +0000 Subject: [PATCH 098/242] changed streaming order --- src/serinv/algs/pobtas.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 5d810cfd..a40a476b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -530,24 +530,23 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - + with compute_stream: + compute_stream.wait_event(compute_B_events[(i - 1) % 2]) + compute_stream.wait_event(d2h_events[(i - 1) % 2]) + B_previous_d = B_d[(i - 1) % 2] + previous_B_event.record(stream=compute_stream) - h2d_stream.wait_event(d2h_events[(i + 1) % 2]) + if i > 0: + h2d_stream.wait_event(previous_B_event) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) h2d_events[(i - 1) % 2].record(stream=h2d_stream) - - h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d.set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) - previous_B_event.record(stream=d2h_stream) - with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - h2d_stream.wait_event(previous_B_event) 
B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -560,9 +559,11 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) - d2h_stream.wait_event(compute_B_events[i % 2]) - B_d[i % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_stream.wait_event(previous_B_event) + B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) + + B_previous_d.get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From 5efb03a00566029e007c8844f2f419ce63924f01 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 12:13:52 +0000 Subject: [PATCH 099/242] removed strange get --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a40a476b..338c0a16 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -563,7 +563,7 @@ def _pobtas_streaming( B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - B_previous_d.get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + # B_previous_d.get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From 66d2f6bfa4baef93c3b6fe698a351e324fb9760b Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 12:21:01 +0000 Subject: [PATCH 100/242] insert debug staetments --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 338c0a16..df4534e3 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -547,6 +547,8 @@ def _pobtas_streaming( with compute_stream: 
compute_stream.wait_event(h2d_events[i % 2]) + print(B_d) + print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From cd2b9c7e2ec1cb78389c9e349fb96e73bc7a1560 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 12:22:05 +0000 Subject: [PATCH 101/242] changed debug --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index df4534e3..640aeade 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -547,7 +547,7 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - print(B_d) + print(B) print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From 9db7858e7777088ccfa516093f3d136c5204ea5e Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 12:31:31 +0000 Subject: [PATCH 102/242] changed last get --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 640aeade..32fe28f4 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -565,7 +565,7 @@ def _pobtas_streaming( B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - # B_previous_d.get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From b37207be286536a66d9c8fe5c77ebc594ec820c3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 13:44:01 +0000 Subject: [PATCH 103/242] more debugging --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 32fe28f4..319d2106 100644 --- a/src/serinv/algs/pobtas.py +++ 
b/src/serinv/algs/pobtas.py @@ -533,7 +533,9 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(compute_B_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) + print(B_previous_d) B_previous_d = B_d[(i - 1) % 2] + print(B_previous_d) previous_B_event.record(stream=compute_stream) From ad6d37520c9f9c51c503f2aea6c12efa6ae19d59 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:28:53 +0000 Subject: [PATCH 104/242] changed B events --- src/serinv/algs/pobtas.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 319d2106..fef20ade 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -466,7 +466,7 @@ def _pobtas_streaming( # Events compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] - previous_B_event = cp.cuda.Event() + previous_B_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -512,7 +512,7 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) B_d[(n_diag_blocks - 1) % 2].get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) - previous_B_event.record(stream=d2h_stream) + previous_B_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: @@ -523,7 +523,7 @@ def _pobtas_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - h2d_stream.wait_event(previous_B_event) + h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : 
-arrow_blocksize], stream=h2d_stream) h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) @@ -536,11 +536,11 @@ def _pobtas_streaming( print(B_previous_d) B_previous_d = B_d[(i - 1) % 2] print(B_previous_d) - previous_B_event.record(stream=compute_stream) + previous_B_events[i % 2].record(stream=compute_stream) if i > 0: - h2d_stream.wait_event(previous_B_event) + h2d_stream.wait_event(previous_B_events[i % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -563,7 +563,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) - d2h_stream.wait_event(previous_B_event) + d2h_stream.wait_event(previous_B_events[i % 2]) B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From e83d0b888b12b2520afc5f31eb00b64fba5f5fbb Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:34:54 +0000 Subject: [PATCH 105/242] print B_d --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index fef20ade..22f5b0a0 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -550,6 +550,7 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) print(B) + print(B_d) print(B_previous_d) B_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From 6fd9ff1c594b66f09725a1bcccd413097f4e0254 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:36:00 +0000 Subject: [PATCH 106/242] insert seperator print --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 22f5b0a0..b7eb8af0 100644 --- 
a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -530,6 +530,7 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + print("---") with compute_stream: compute_stream.wait_event(compute_B_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) From 3bc6718e741b10ceddd4f0657ae869d42f4b9d61 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:43:58 +0000 Subject: [PATCH 107/242] changed location of previous B event --- src/serinv/algs/pobtas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index b7eb8af0..2913037b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -512,7 +512,7 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) B_d[(n_diag_blocks - 1) % 2].get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) - previous_B_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + if n_diag_blocks > 1: @@ -525,7 +525,7 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) - + previous_B_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): From ab3fd2a53733de7f7eda9e05303f2e2522329cc2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:46:56 +0000 Subject: [PATCH 108/242] changed order of compute stream --- src/serinv/algs/pobtas.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git 
a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 2913037b..b310e36f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -531,13 +531,7 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} print("---") - with compute_stream: - compute_stream.wait_event(compute_B_events[(i - 1) % 2]) - compute_stream.wait_event(d2h_events[(i - 1) % 2]) - print(B_previous_d) - B_previous_d = B_d[(i - 1) % 2] - print(B_previous_d) - previous_B_events[i % 2].record(stream=compute_stream) + if i > 0: @@ -569,6 +563,14 @@ def _pobtas_streaming( B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) + with compute_stream: + compute_stream.wait_event(compute_B_events[i % 2]) + compute_stream.wait_event(d2h_events[i % 2]) + print(B_previous_d) + B_previous_d = B_d[(i - 1) % 2] + print(B_previous_d) + previous_B_events[i % 2].record(stream=compute_stream) + B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: From 5e72b089de522fcce97b92651ab94eaf24d8d0d4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 15:50:26 +0000 Subject: [PATCH 109/242] switched chose previous B --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index b310e36f..7b4b7a9b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -567,7 +567,7 @@ def _pobtas_streaming( compute_stream.wait_event(compute_B_events[i % 2]) compute_stream.wait_event(d2h_events[i % 2]) print(B_previous_d) - B_previous_d = B_d[(i - 1) % 2] + B_previous_d = B_d[i % 2] print(B_previous_d) previous_B_events[i % 2].record(stream=compute_stream) From 7155c8a29f36f22892d7fd6dea5b14ccdcd8aa6b Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 16:02:53 +0000 Subject: 
[PATCH 110/242] changed wait event --- src/serinv/algs/pobtas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 7b4b7a9b..442e9858 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -535,7 +535,7 @@ def _pobtas_streaming( if i > 0: - h2d_stream.wait_event(previous_B_events[i % 2]) + h2d_stream.wait_event(previous_B_events[(i -1 ) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -564,7 +564,6 @@ def _pobtas_streaming( d2h_events[i % 2].record(stream=d2h_stream) with compute_stream: - compute_stream.wait_event(compute_B_events[i % 2]) compute_stream.wait_event(d2h_events[i % 2]) print(B_previous_d) B_previous_d = B_d[i % 2] From 09f31c3b6e327d463e41f051cac4df3f06275ca4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 16:03:13 +0000 Subject: [PATCH 111/242] changed another wait event --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 442e9858..6dc4d695 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -559,7 +559,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) - d2h_stream.wait_event(previous_B_events[i % 2]) + d2h_stream.wait_event(previous_B_events[(i - 1) % 2]) B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From d75b7ff7b49909b295a5bd74f29860e7d9a5418a Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 16:57:40 +0000 Subject: [PATCH 112/242] changed stream pattern --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6dc4d695..f3095387 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -535,7 +535,7 @@ def _pobtas_streaming( if i > 0: - h2d_stream.wait_event(previous_B_events[(i -1 ) % 2]) + h2d_stream.wait_event(previous_B_events[(i - 1 ) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -559,7 +559,8 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) - d2h_stream.wait_event(previous_B_events[(i - 1) % 2]) + # d2h_stream.wait_event(previous_B_events[(i - 1) % 2]) + d2h_stream.wait_event(h2d_events[i % 2]) B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From 87fb54b8971afb804b893626e92d8bd943cca085 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 18:14:08 +0000 Subject: [PATCH 113/242] changed previous B --- src/serinv/algs/pobtas.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f3095387..99086fe5 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -461,7 +461,9 @@ def _pobtas_streaming( elif trans == "T" or trans == "C": # Buffers - B_previous_d = cp.empty_like(B_shape) + B_previous_d = cp.empty( + (2, *B_shape.shape), dtype=B_shape.dtype + ) del B_shape # Events @@ -524,7 +526,7 @@ def _pobtas_streaming( L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) - 
B_previous_d.set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) previous_B_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) @@ -535,7 +537,7 @@ def _pobtas_streaming( if i > 0: - h2d_stream.wait_event(previous_B_events[(i - 1 ) % 2]) + h2d_stream.wait_event(compute_B_events[(i - 1 ) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -544,14 +546,11 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[i % 2]) - print(B) - print(B_d) - print(B_previous_d) - B_d[i % 2] = cu_la.solve_triangular( + B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d + @ B_previous_d[(i - 1) % 2] - L_lower_arrow_blocks_d[i % 2].conj().T @ B_arrow_tip_d, lower=True, trans="C", @@ -559,19 +558,11 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) - # d2h_stream.wait_event(previous_B_events[(i - 1) % 2]) - d2h_stream.wait_event(h2d_events[i % 2]) - B_previous_d.get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - with compute_stream: - compute_stream.wait_event(d2h_events[i % 2]) - print(B_previous_d) - B_previous_d = B_d[i % 2] - print(B_previous_d) - previous_B_events[i % 2].record(stream=compute_stream) - - 
B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From 464ca75d3e830f27ff8dd2935d3936b7f8e422cd Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 18:15:48 +0000 Subject: [PATCH 114/242] removed last B get --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 99086fe5..bee0774d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -562,7 +562,7 @@ def _pobtas_streaming( B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From ae2e2699803b34ca32edbfc75a1ef974934e02f6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:02:19 +0000 Subject: [PATCH 115/242] changed indexing --- src/serinv/algs/pobtas.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index bee0774d..c7058bd6 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -527,25 +527,21 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) - previous_B_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) - h2d_events[n_diag_blocks % 2].record(stream=h2d_stream) + h2d_events[(n_diag_blocks - 1) % 
2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} print("---") - - - if i > 0: - h2d_stream.wait_event(compute_B_events[(i - 1 ) % 2]) + h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) - h2d_events[(i - 1) % 2].record(stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: - compute_stream.wait_event(h2d_events[i % 2]) + compute_stream.wait_event(h2d_events[(i - 1) % 2]) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 729be575b9cf91ab441d098c9575ef524461bfbe Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:05:40 +0000 Subject: [PATCH 116/242] changed streaming a bit --- src/serinv/algs/pobtas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index c7058bd6..60925305 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -542,11 +542,12 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) + compute_stream.wait_event(d2h_events[(i - 1) % 2]) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d[(i - 1) % 2] + @ B_previous_d[(i + 1) % 2] - L_lower_arrow_blocks_d[i % 2].conj().T @ B_arrow_tip_d, lower=True, trans="C", @@ -555,7 +556,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i - 1) % 
2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) From f801076781c680256e8c4dd327ad3ffcba0e127a Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:06:33 +0000 Subject: [PATCH 117/242] insert debug --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 60925305..6d47f232 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -543,6 +543,8 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) + print(B_d) + print(B_previous_d) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 44d858251b8e963a8d95adebe2e5870c576d2909 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:11:41 +0000 Subject: [PATCH 118/242] more debug --- src/serinv/algs/pobtas.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6d47f232..0aed83c2 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -534,6 +534,7 @@ def _pobtas_streaming( print("---") if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) + print("h2d") B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -543,6 +544,7 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 
2]) + print("compute") print(B_d) print(B_previous_d) B_previous_d[i % 2] = cu_la.solve_triangular( @@ -558,6 +560,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + print("d2h") B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From 0c816b0438faed7c507cb0b49db7da0c5a01e464 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:39:28 +0000 Subject: [PATCH 119/242] inser print B --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0aed83c2..8d77b73c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -545,6 +545,7 @@ def _pobtas_streaming( compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) print("compute") + print(B) print(B_d) print(B_previous_d) B_previous_d[i % 2] = cu_la.solve_triangular( From b0a6473952de23f82f6d3f09fa3457b439b06c07 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:40:23 +0000 Subject: [PATCH 120/242] another print B --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 8d77b73c..79b832d3 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -566,7 +566,7 @@ def _pobtas_streaming( d2h_events[i % 2].record(stream=d2h_stream) #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) - + print(B) else: raise ValueError(f"Invalid transpose argument: {trans}.") From a73f8d20361c6bbb82cbfafda62da52413c2fbec Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:44:46 +0000 Subject: [PATCH 121/242] print xref --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 4c51e79c..0824e72b 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -102,5 +102,6 @@ def test_pobtas( trans="C", device_streaming=True if array_type == "streaming" else False, ) + print(X_ref) assert xp.allclose(B, X_ref) From 39138d09b1b22bcc18d60db13b0bc7c1252a5de3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 19:45:28 +0000 Subject: [PATCH 122/242] more debug --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 0824e72b..e9ce2384 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -102,6 +102,7 @@ def test_pobtas( trans="C", device_streaming=True if array_type == "streaming" else False, ) + print("===") print(X_ref) assert xp.allclose(B, X_ref) From 8b74d46ad525825d447ba4a01ebdcd2d3eadcbb5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:01:07 +0000 Subject: [PATCH 123/242] another B_d print --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 79b832d3..8509927f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -562,6 +562,7 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) print("d2h") + print(B_previous_d) B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From 43daa6870babe68030a8660e6535345d7a05fa6f Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:02:18 +0000 Subject: [PATCH 124/242] insert last B d2h --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 8509927f..a5146f09 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -566,7 +566,7 @@ def _pobtas_streaming( B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) print(B) else: raise ValueError(f"Invalid transpose argument: {trans}.") From ac9f3d65434b29c564b383cc6c165f14105449f3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:03:12 +0000 Subject: [PATCH 125/242] condition last stream --- src/serinv/algs/pobtas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a5146f09..08fbcc3e 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -566,7 +566,8 @@ def _pobtas_streaming( B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + if n_diag_blocks > 1: + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) print(B) else: raise ValueError(f"Invalid transpose argument: {trans}.") From a74bcbe949e02198c4f82f7eea2f1364bdcbb348 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:15:05 +0000 Subject: [PATCH 126/242] insert wait event for last stream --- src/serinv/algs/pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 08fbcc3e..ecf02748 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -567,6 +567,7 @@ def _pobtas_streaming( d2h_events[i % 
2].record(stream=d2h_stream) if n_diag_blocks > 1: + d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) print(B) else: From 3e9644bb195c9757d5006318c9b05ae109f0538d Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:17:41 +0000 Subject: [PATCH 127/242] backward solve working --- src/serinv/algs/pobtas.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index ecf02748..b34fe185 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -531,10 +531,8 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - print("---") if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) - print("h2d") B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) @@ -544,10 +542,6 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) - print("compute") - print(B) - print(B_d) - print(B_previous_d) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -561,15 +555,13 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - print("d2h") - print(B_previous_d) B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) - print(B) + else: raise 
ValueError(f"Invalid transpose argument: {trans}.") From 7f17c0fed37896d4f00e4590f56b1a9305f28d1d Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:20:09 +0000 Subject: [PATCH 128/242] bigger tests --- tests/tests_algs/regular/conftest.py | 1 + tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/conftest.py b/tests/tests_algs/regular/conftest.py index 239baed2..1a1d730f 100644 --- a/tests/tests_algs/regular/conftest.py +++ b/tests/tests_algs/regular/conftest.py @@ -9,6 +9,7 @@ pytest.param(2, id="n_diag_blocks=2"), pytest.param(3, id="n_diag_blocks=3"), pytest.param(4, id="n_diag_blocks=4"), + pytest.param(4, id="n_diag_blocks=20"), ] diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index e9ce2384..7e3c0991 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 10]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 7f87fc73743065688366cac558d5240a2c1b5fe6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Wed, 30 Apr 2025 20:20:55 +0000 Subject: [PATCH 129/242] even bigger tests --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 7e3c0991..2ad9d895 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 10]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 10, 40]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 161613105b23c6382545cf842b2c16f92f6c7590 Mon Sep 17 00:00:00 
2001 From: 03szust Date: Wed, 30 Apr 2025 20:21:34 +0000 Subject: [PATCH 130/242] reverted tests for now --- tests/tests_algs/regular/conftest.py | 1 - tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/tests_algs/regular/conftest.py b/tests/tests_algs/regular/conftest.py index 1a1d730f..239baed2 100644 --- a/tests/tests_algs/regular/conftest.py +++ b/tests/tests_algs/regular/conftest.py @@ -9,7 +9,6 @@ pytest.param(2, id="n_diag_blocks=2"), pytest.param(3, id="n_diag_blocks=3"), pytest.param(4, id="n_diag_blocks=4"), - pytest.param(4, id="n_diag_blocks=20"), ] diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 2ad9d895..e9ce2384 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 10, 40]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 8335da7c384d1d2f9b9738c543b51bd6f0082aba Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:03:46 +0000 Subject: [PATCH 131/242] first attempt at adapted code for pobts --- src/serinv/algs/pobts.py | 124 ++++++++++++++++++ .../tests_algs/regular/tests_bt/test_pobts.py | 19 ++- 2 files changed, 142 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 3fbcd0d4..a9456cd0 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -4,6 +4,7 @@ from serinv import ( ArrayLike, _get_module_from_array, + _get_module_from_str, ) @@ -150,3 +151,126 @@ def _pobts_permuted( ) else: raise ValueError(f"Invalid transpose argument: {trans}.") + + +def _pobts_streaming( + L_diagonal_blocks: ArrayLike, + L_lower_diagonal_blocks: ArrayLike, + B: ArrayLike, + trans: str, +): + arr_module, _ = 
_get_module_from_array(arr=L_diagonal_blocks) + if arr_module.__name__ != "numpy": + raise NotImplementedError( + "Host<->Device streaming only works when host-arrays are given." + ) + + cp, cu_la = _get_module_from_str(module_str="cupy") + + # Vars + diag_blocksize = L_diagonal_blocks.shape[1] + n_diag_blocks = L_diagonal_blocks.shape[0] + + # Streams + compute_stream = cp.cuda.Stream(non_blocking=True) + h2d_stream = cp.cuda.Stream(non_blocking=True) + d2h_stream = cp.cuda.Stream(non_blocking=True) + + # Device Buffers + # B Buffers + B_shape = B[0 : diag_blocksize] + B_d = cp.empty( + (2, *B_shape.shape), dtype=B_shape.dtype + ) + B_previous_d = cp.empty( + (2, *B_shape.shape), dtype=B_shape.dtype + ) + + # L Buffers + L_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + L_lower_diagonal_blocks_d = cp.empty( + (2, *L_diagonal_blocks.shape[1:]), dtype=L_diagonal_blocks.dtype + ) + + # Events + compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] + previous_B_events = [cp.cuda.Event(), cp.cuda.Event()] + h2d_events = [cp.cuda.Event(), cp.cuda.Event()] + d2h_events = [cp.cuda.Event(), cp.cuda.Event()] + + if trans == "N": + raise NotImplementedError(f"Forward solve not implemented for streaming") + + elif trans == "T" or trans == "C": + print("hi") + + B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) + + h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) + + with compute_stream: + B_d[(n_diag_blocks - 1) % 2] = ( + cu_la.solve_triangular( + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2], + lower=True, + trans="C", + ) + ) + + compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) + + d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize], stream=d2h_stream, 
blocking=False,) + d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + + if n_diag_blocks > 1: + + B_d[n_diag_blocks % 2].set( + arr=B[-(2 * diag_blocksize) : -diag_blocksize], + stream=h2d_stream + ) + L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) + h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) + B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize], stream=h2d_stream) + h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) + + for i in range(n_diag_blocks - 2, -1, -1): + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i > 0: + h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) + L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) + + with compute_stream: + compute_stream.wait_event(h2d_events[(i - 1) % 2]) + compute_stream.wait_event(d2h_events[(i - 1) % 2]) + B_previous_d[i % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[i % 2], + B_d[i % 2] + - L_lower_diagonal_blocks_d[i % 2].conj().T + @ B_previous_d[(i + 1) % 2], + lower=True, + trans="C", + ) + + compute_B_events[i % 2].record(compute_stream) + + d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_events[i % 2].record(stream=d2h_stream) + + if n_diag_blocks > 1: + d2h_stream.wait_event(compute_B_events[0]) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + + else: + raise ValueError(f"Invalid transpose argument: {trans}.") + + 
cp.cuda.Device().synchronize() \ No newline at end of file diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index 8125df52..fdc145b0 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -3,11 +3,13 @@ import numpy as np import pytest -from serinv import _get_module_from_array +from serinv import backend_flags, _get_module_from_array from ....testing_utils import bt_dense_to_arrays, dd_bt, symmetrize, rhs from serinv.algs import pobtf, pobts +if backend_flags["cupy_avail"]: + import cupyx as cpx @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) @@ -18,6 +20,8 @@ def test_pobts( array_type: str, dtype: np.dtype, ): + array_type = "streaming" + A = dd_bt( diagonal_blocksize, n_diag_blocks, @@ -47,6 +51,18 @@ def test_pobts( _, ) = bt_dense_to_arrays(A, diagonal_blocksize, n_diag_blocks) + if backend_flags["cupy_avail"] and array_type == "streaming": + A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks) + A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks[:, :, :] + A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks) + A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks[:, :, :] + B_pinned = cpx.zeros_like_pinned(B) + B_pinned[:, :] = B[:, :] + + A_diagonal_blocks = A_diagonal_blocks_pinned + A_lower_diagonal_blocks = A_lower_diagonal_blocks_pinned + B = B_pinned + pobtf( A_diagonal_blocks, A_lower_diagonal_blocks, @@ -66,6 +82,7 @@ def test_pobts( A_lower_diagonal_blocks, B, trans="C", + device_streaming=True if array_type == "streaming" else False, ) assert xp.allclose(B, X_ref) From 46627ec2d6ca4f01d1483ef18333abed74a8b245 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:06:23 +0000 Subject: [PATCH 132/242] removed not implemented error --- src/serinv/algs/pobts.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index a9456cd0..20028427 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -42,8 +42,11 @@ def pobts( else: # Natural arrowhead if device_streaming: - raise NotImplementedError( - "Streaming is not implemented for the natural arrowhead." + _pobts_streaming( + L_diagonal_blocks, + L_lower_diagonal_blocks, + B, + trans, ) else: _pobts( From 43afebe60776aac30e11f8df598874b9a81637ca Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:09:11 +0000 Subject: [PATCH 133/242] insert debug --- src/serinv/algs/pobts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 20028427..61788335 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -207,7 +207,8 @@ def _pobts_streaming( raise NotImplementedError(f"Forward solve not implemented for streaming") elif trans == "T" or trans == "C": - print("hi") + print(B_d) + print(B) B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) From 1b05487ecb0b11c050a006b0d337004496ab8144 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:10:57 +0000 Subject: [PATCH 134/242] fixed array slicing --- src/serinv/algs/pobts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 61788335..f1c80e93 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -210,7 +210,7 @@ def _pobts_streaming( print(B_d) print(B) - B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize], stream=h2d_stream) + B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) @@ -228,7 +228,7 @@ def 
_pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize], stream=d2h_stream, blocking=False,) + B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False,) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: @@ -240,7 +240,7 @@ def _pobts_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) - B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize], stream=h2d_stream) + B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): From 34d6577edad2867ef5d6516cceec3be103e0a282 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:15:26 +0000 Subject: [PATCH 135/242] pobts streaming working --- src/serinv/algs/pobts.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index f1c80e93..4a1c80f7 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -207,9 +207,6 @@ def _pobts_streaming( raise NotImplementedError(f"Forward solve not implemented for streaming") elif trans == "T" or trans == "C": - print(B_d) - print(B) - B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) From 42e215e712158fabf3747bd94173110e469ece4e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 09:59:41 +0000 Subject: [PATCH 136/242] first attempt at pobts forward streaming by flipping it --- src/serinv/algs/pobts.py | 63 
+++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 4a1c80f7..47d74bc8 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -204,7 +204,68 @@ def _pobts_streaming( d2h_events = [cp.cuda.Event(), cp.cuda.Event()] if trans == "N": - raise NotImplementedError(f"Forward solve not implemented for streaming") + B_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) + + h2d_events[0].record(stream=h2d_stream) + + with compute_stream: + B_d[0] = ( + cu_la.solve_triangular( + L_diagonal_blocks_d[0], + B_d[0], + lower=True, + trans="C", + ) + ) + + d2h_stream.wait_event(compute_B_events[0]) + B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) + d2h_events[0].record(stream=d2h_stream) + + if n_diag_blocks > 1: + + B_d[1].set( + arr=B[diag_blocksize : (2 * diag_blocksize)], + stream=h2d_stream + ) + L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) + L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) + h2d_stream.wait_event(previous_B_events[0]) + B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) + h2d_events[0].record(stream=h2d_stream) + + for i in range(1, n_diag_blocks - 1): + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i + 1 < n_diag_blocks - 1: + h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) + L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) + + with compute_stream: + compute_stream.wait_event(h2d_events[(i + 1) % 2]) + compute_stream.wait_event(d2h_events[(i + 1) % 
2]) + B_previous_d[i % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[i % 2], + B_d[i % 2] + - L_lower_diagonal_blocks_d[i % 2].conj().T + @ B_previous_d[(i - 1) % 2], + lower=True, + trans="C", + ) + + compute_B_events[i % 2].record(compute_stream) + + d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_events[i % 2].record(stream=d2h_stream) + + if n_diag_blocks > 1: + d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + B_previous_d[n_diag_blocks - 1].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + elif trans == "T" or trans == "C": B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) From abe2879d2a682980064a900bbef58f60cca3bf59 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:00:13 +0000 Subject: [PATCH 137/242] added test logic --- tests/tests_algs/regular/tests_bt/test_pobts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index fdc145b0..f5c941dc 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -74,6 +74,7 @@ def test_pobts( A_lower_diagonal_blocks, B, trans="N", + device_streaming=True if array_type == "streaming" else False, ) # Backward solve: X=L^{-T}Y From 96652d59a7ac72a7aef31c9ef8e2b8949c1ed076 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:01:44 +0000 Subject: [PATCH 138/242] changed indexing --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 47d74bc8..da666433 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -264,7 +264,7 @@ def _pobts_streaming( if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - 
B_previous_d[n_diag_blocks - 1].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": From e6ce6c48f1a4df0b81edefc82e1912ef1b0d7240 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:06:28 +0000 Subject: [PATCH 139/242] fixed more indexing --- src/serinv/algs/pobts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index da666433..0552c974 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -230,12 +230,12 @@ def _pobts_streaming( stream=h2d_stream ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[1], stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) - for i in range(1, n_diag_blocks - 1): + for i in range(0, n_diag_blocks - 1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} if i + 1 < n_diag_blocks - 1: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) From ea29b8ed5b21faefb6578972f6b65168100cdd30 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:10:16 +0000 Subject: [PATCH 140/242] switched event order --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 0552c974..cc646da1 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -230,7 +230,7 @@ def _pobts_streaming( stream=h2d_stream ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + 
L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) From 6fab517f2a1e47c87cc57ab29a11e5fbf8c9d9fb Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:17:30 +0000 Subject: [PATCH 141/242] changed first block logic --- src/serinv/algs/pobts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index cc646da1..9fa7ce33 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -215,10 +215,11 @@ def _pobts_streaming( L_diagonal_blocks_d[0], B_d[0], lower=True, - trans="C", ) ) + compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) + d2h_stream.wait_event(compute_B_events[0]) B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) d2h_events[0].record(stream=d2h_stream) @@ -230,7 +231,7 @@ def _pobts_streaming( stream=h2d_stream ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) From 5e87ead868e4fc17ff18a7ca959b03355abacf2e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:19:36 +0000 Subject: [PATCH 142/242] fixed solve --- src/serinv/algs/pobts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 9fa7ce33..5196e26c 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -254,7 +254,6 @@ def _pobts_streaming( - L_lower_diagonal_blocks_d[i % 2].conj().T @ B_previous_d[(i - 1) % 2], lower=True, - trans="C", ) compute_B_events[i % 
2].record(compute_stream) From 82df44546e93285034e4c875fc9d55803c90868a Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:20:37 +0000 Subject: [PATCH 143/242] insert debug statement --- src/serinv/algs/pobts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 5196e26c..ea8876aa 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -259,6 +259,7 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + print(B_previous_d) B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From 93976bd3567617b4c01abd3642e6e742d0e97d11 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:24:07 +0000 Subject: [PATCH 144/242] changed lower diagonal order --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index ea8876aa..02a90690 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -231,7 +231,7 @@ def _pobts_streaming( stream=h2d_stream ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) From 72d1b8368e731e80a88e8f7105dba0059706a3bd Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:26:02 +0000 Subject: [PATCH 145/242] insert debug message --- src/serinv/algs/pobts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 02a90690..3687cb11 100644 --- a/src/serinv/algs/pobts.py +++
b/src/serinv/algs/pobts.py @@ -260,6 +260,7 @@ def _pobts_streaming( d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) print(B_previous_d) + print(B[(i - 1) * diag_blocksize : i * diag_blocksize]) B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From bf8077edcf4f0b56e5cc35982b981f8f9be8b279 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 10:28:30 +0000 Subject: [PATCH 146/242] changed slicing --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 3687cb11..53b011a9 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -261,7 +261,7 @@ def _pobts_streaming( d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) print(B_previous_d) print(B[(i - 1) * diag_blocksize : i * diag_blocksize]) - B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[(i + 1) % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: From 046dffcafa662aff0d12f7a396b50e60d1ea41a5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:24:08 +0000 Subject: [PATCH 147/242] adjusted loop --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 53b011a9..143f4d49 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -236,7 +236,7 @@ def _pobts_streaming( B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) - for i in range(0, n_diag_blocks - 1): + for i in range(1, n_diag_blocks - 1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} if i + 1 < n_diag_blocks - 1: 
h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) From f3bc5856a1690795650eac6f63adf1c0b992c8f2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:25:44 +0000 Subject: [PATCH 148/242] adjusted loop --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 143f4d49..041ad9fb 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -252,7 +252,7 @@ def _pobts_streaming( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d[(i - 1) % 2], + @ B_previous_d[i % 2], lower=True, ) From dfbf23b00e07db1fc0c47f7f7e0d2965377223ed Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:35:27 +0000 Subject: [PATCH 149/242] changed previous B --- src/serinv/algs/pobts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 041ad9fb..972d81ff 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -233,11 +233,12 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) - B_previous_d[0].set(arr=B[-diag_blocksize:], stream=h2d_stream) + B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks - 1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i + 1 < n_diag_blocks - 1: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) From c59634a214fb2be022966040bc043bd89a431c2d Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:37:52 +0000 Subject: [PATCH 150/242] insert debug check 1 --- src/serinv/algs/pobts.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 972d81ff..6f267df0 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -249,6 +249,8 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) + print(L_diagonal_blocks) + print(L_diagonal_blocks_d) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -260,8 +262,6 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - print(B_previous_d) - print(B[(i - 1) * diag_blocksize : i * diag_blocksize]) B_previous_d[(i + 1) % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) From 5f917064e1751cc41ca89f0862d66cd311134127 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:39:21 +0000 Subject: [PATCH 151/242] adjusted streaming --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 6f267df0..539ec20f 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -239,7 +239,7 @@ def _pobts_streaming( for i in range(1, n_diag_blocks - 1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - if i + 1 < n_diag_blocks - 1: + if i < n_diag_blocks - 1: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) From adc84f87abb8a92820a91bd13f5d67e85164b083 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:40:15 +0000 Subject: [PATCH 152/242] adjusted streaming --- src/serinv/algs/pobts.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 
deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 539ec20f..94675fe6 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -239,12 +239,13 @@ def _pobts_streaming( for i in range(1, n_diag_blocks - 1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - if i < n_diag_blocks - 1: - h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) - L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) + + h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) + if i + 1 < n_diag_blocks - 1: L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) - h2d_events[i % 2].record(stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) % 2]) From 1409c5d4cb3db62433df7ac266f7fa0f00b7a4d7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:41:44 +0000 Subject: [PATCH 153/242] insert more debug --- src/serinv/algs/pobts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 94675fe6..edf61cbf 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -252,6 +252,7 @@ def _pobts_streaming( compute_stream.wait_event(d2h_events[(i + 1) % 2]) print(L_diagonal_blocks) print(L_diagonal_blocks_d) + print(i % 2) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 6e13d6f01ef94e96bae47aa40c250f828ee24782 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:42:45 +0000 Subject: [PATCH 154/242] expanded for loop --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index edf61cbf..9f2101a2 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -236,7 +236,7 @@ def _pobts_streaming( B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) - for i in range(1, n_diag_blocks - 1): + for i in range(1, n_diag_blocks): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} From d856785f66341c501040e645d8fb9ec390a7efe0 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:43:23 +0000 Subject: [PATCH 155/242] adjusted streaming --- src/serinv/algs/pobts.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 9f2101a2..cd37581d 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -239,13 +239,13 @@ def _pobts_streaming( for i in range(1, n_diag_blocks): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - - h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) - L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) - if i + 1 < n_diag_blocks - 1: - L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) - h2d_events[i % 2].record(stream=h2d_stream) + if i < n_diag_blocks - 1: + h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) + L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) + if i + 1 < n_diag_blocks - 1: + L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) 
% 2]) From f46a64a1c6a18c49f101cad648cdfa0eebc9696b Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:44:34 +0000 Subject: [PATCH 156/242] check number 2 --- src/serinv/algs/pobts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index cd37581d..e407374b 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -250,8 +250,8 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) - print(L_diagonal_blocks) - print(L_diagonal_blocks_d) + print(L_lower_diagonal_blocks) + print(L_lower_diagonal_blocks_d) print(i % 2) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From 871a3b75f8236c9c60c89afdaedbadcca32113fe Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:49:05 +0000 Subject: [PATCH 157/242] shifted indexing --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index e407374b..b307daee 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -233,7 +233,7 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) - B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) + B_previous_d[1].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks): From 6f4971cee1c7d8ec8c32dd983b54b5238af06c8e Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 12:53:54 +0000 Subject: [PATCH 158/242] changed lower streaming --- src/serinv/algs/pobts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index b307daee..dbd74e0f 100644 --- 
a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -243,8 +243,7 @@ def _pobts_streaming( h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) - if i + 1 < n_diag_blocks - 1: - L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream) + L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i], stream=h2d_stream) h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: From 8307f51b509bdacfcaa373fb3eadc06cce36c680 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:06:30 +0000 Subject: [PATCH 159/242] more debug --- src/serinv/algs/pobts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index dbd74e0f..bd1fbfc9 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -239,7 +239,8 @@ def _pobts_streaming( for i in range(1, n_diag_blocks): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} - if i < n_diag_blocks - 1: + if i + 1 < n_diag_blocks: + print(i) h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) From 742dcd390983d3712285b6916d2a9815490f6cce Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:09:29 +0000 Subject: [PATCH 160/242] removed some debug --- src/serinv/algs/pobts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index bd1fbfc9..3a61135f 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -240,7 +240,6 @@ def _pobts_streaming( # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} if 
i + 1 < n_diag_blocks: - print(i) h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) From c4f3fed9f010702b605791e470e0e01fc8ea9901 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:10:41 +0000 Subject: [PATCH 161/242] debug number 3 --- src/serinv/algs/pobts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 3a61135f..d6e60a9c 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -249,8 +249,8 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) - print(L_lower_diagonal_blocks) - print(L_lower_diagonal_blocks_d) + print(B) + print(B_d) print(i % 2) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From 620ee3b91b8c1e52d00a31b13023470736d58b27 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:52:21 +0000 Subject: [PATCH 162/242] changed B streaming --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index d6e60a9c..97584795 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -241,7 +241,7 @@ def _pobts_streaming( if i + 1 < n_diag_blocks: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2].set(arr=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=h2d_stream) + B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i], stream=h2d_stream) h2d_events[i % 2].record(stream=h2d_stream) From 
82b8190a42e19e68a6992cacb49295b1e23c9504 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:54:06 +0000 Subject: [PATCH 163/242] more changes to B streaming --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 97584795..0dd1a44f 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -233,7 +233,7 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) - B_previous_d[1].set(arr=B[:diag_blocksize], stream=h2d_stream) + B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks): From f88aad87e509d2c05922c080b5ce15467a86a2fd Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:55:10 +0000 Subject: [PATCH 164/242] changed B previous --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 0dd1a44f..71092252 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -256,7 +256,7 @@ def _pobts_streaming( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d[i % 2], + @ B_previous_d[(i + 1) % 2], lower=True, ) From 9e401ac6b8173f64a191ceb0547b9545fbb3e06b Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 13:58:04 +0000 Subject: [PATCH 165/242] removed wrong transposition --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 71092252..714706b1 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -255,7 +255,7 @@ def _pobts_streaming( B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 
2] - - L_lower_diagonal_blocks_d[i % 2].conj().T + - L_lower_diagonal_blocks_d[i % 2] @ B_previous_d[(i + 1) % 2], lower=True, ) From 7c24151821669e4ba3760e2081480a7dea5810f6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 14:44:07 +0000 Subject: [PATCH 166/242] debug check 4 --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 714706b1..edb1dbaf 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -250,7 +250,7 @@ def _pobts_streaming( compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) print(B) - print(B_d) + print(B_previous_d) print(i % 2) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], From eff1bdab4635c2409fa473280ed51231fa378932 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 14:56:17 +0000 Subject: [PATCH 167/242] debug b previous --- src/serinv/algs/pobts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index edb1dbaf..38512951 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -302,6 +302,7 @@ def _pobts_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) + print(B_previous_d) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From c021ca075bed0747a83f8ed07c8e9ab8b6e18fb7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 14:57:12 +0000 Subject: [PATCH 168/242] moved debug message --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 
38512951..f5127092 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -233,6 +233,7 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) + print(B_previous_d) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) @@ -302,7 +303,6 @@ def _pobts_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) - print(B_previous_d) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From 77941229c3f4b30a19934985a58c07d152d9fe7f Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:00:05 +0000 Subject: [PATCH 169/242] shift B previous get --- src/serinv/algs/pobts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index f5127092..b2641b20 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -257,14 +257,14 @@ def _pobts_streaming( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2] - @ B_previous_d[(i + 1) % 2], + @ B_previous_d[(i - 1) % 2], lower=True, ) compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i + 1) % 2].get(out=B[i * diag_blocksize : (i + 1) * diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[(i - 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: From 3e6d3c3246162eb62279ab4b9cd620aa82e63131 Mon Sep 17 
00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:03:04 +0000 Subject: [PATCH 170/242] changed last B --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index b2641b20..4f3ee4a5 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -268,7 +268,7 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + d2h_stream.wait_event(compute_B_events[(n_diag_blocks) % 2]) B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) From e5fb88a7130d5e17ceb9731e1710dfce100e80c7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:07:26 +0000 Subject: [PATCH 171/242] test for last B --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 4f3ee4a5..c564df58 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -269,7 +269,7 @@ def _pobts_streaming( if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[(n_diag_blocks) % 2]) - B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + # B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": From 7eeb5c198ea2cc5d832ed83f22bd1d445077f3c3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:07:47 +0000 Subject: [PATCH 172/242] revert --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index c564df58..4f3ee4a5 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -269,7 +269,7 @@ def _pobts_streaming( if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[(n_diag_blocks) % 2]) - # 
B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": From 2afe74b81e798360f96ffb23ce9e9cebdbe9fb45 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:10:16 +0000 Subject: [PATCH 173/242] try different stream order --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 4f3ee4a5..7e5f5564 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -235,7 +235,7 @@ def _pobts_streaming( h2d_stream.wait_event(previous_B_events[0]) print(B_previous_d) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) - h2d_events[0].record(stream=h2d_stream) + h2d_events[1].record(stream=h2d_stream) for i in range(1, n_diag_blocks): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} From c06b8496b434fb6620ff8e5d7e839d1217c1a666 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:13:11 +0000 Subject: [PATCH 174/242] insert failsafe --- src/serinv/algs/pobts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 7e5f5564..d935de1d 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -210,6 +210,7 @@ def _pobts_streaming( h2d_events[0].record(stream=h2d_stream) with compute_stream: + compute_stream.wait_event(h2d_events[0]) B_d[0] = ( cu_la.solve_triangular( L_diagonal_blocks_d[0], @@ -235,7 +236,7 @@ def _pobts_streaming( h2d_stream.wait_event(previous_B_events[0]) print(B_previous_d) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) - h2d_events[1].record(stream=h2d_stream) + h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T 
X_{ndb+1} From 01f67d16e744d76f43b5a1b5b2a72e4d01d2fe22 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:17:01 +0000 Subject: [PATCH 175/242] more failsafe --- src/serinv/algs/pobts.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index d935de1d..f2885162 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -207,10 +207,10 @@ def _pobts_streaming( B_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) - h2d_events[0].record(stream=h2d_stream) + h2d_events[1].record(stream=h2d_stream) with compute_stream: - compute_stream.wait_event(h2d_events[0]) + compute_stream.wait_event(h2d_events[1]) B_d[0] = ( cu_la.solve_triangular( L_diagonal_blocks_d[0], @@ -234,7 +234,6 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_stream.wait_event(previous_B_events[0]) - print(B_previous_d) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) From 5be4c6fbc941094cd3927d2c00cb6fbd46ad1b61 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:19:48 +0000 Subject: [PATCH 176/242] removed unnecessary events --- src/serinv/algs/pobts.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index f2885162..1d852b1a 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -233,7 +233,6 @@ def _pobts_streaming( ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - h2d_stream.wait_event(previous_B_events[0]) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) @@ -302,7 +301,6 @@ def _pobts_streaming( ) 
L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From e65378058923dc8fb1799cd0adcd06c08c7558c4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:22:51 +0000 Subject: [PATCH 177/242] stream failsafes --- src/serinv/algs/pobts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 1d852b1a..5e110f9e 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -199,7 +199,6 @@ def _pobts_streaming( # Events compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] - previous_B_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -226,7 +225,7 @@ def _pobts_streaming( d2h_events[0].record(stream=d2h_stream) if n_diag_blocks > 1: - + h2d_stream.wait_event(d2h_events[0]) B_d[1].set( arr=B[diag_blocksize : (2 * diag_blocksize)], stream=h2d_stream From 93b669a425e536ed745f3ab919dd8299a7d60b02 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:23:54 +0000 Subject: [PATCH 178/242] more failsafe --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 5e110f9e..18ed355e 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -293,7 +293,7 @@ def _pobts_streaming( d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - + h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-(2 * diag_blocksize) : -diag_blocksize], stream=h2d_stream From 
c6fc65fb8a28f39d3ebf95efc5efbd08879a6795 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:31:26 +0000 Subject: [PATCH 179/242] changed faulty event --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 18ed355e..3d108daf 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -218,7 +218,7 @@ def _pobts_streaming( ) ) - compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) + compute_B_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_B_events[0]) B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) From 12ca6405cc0c42c54f2c9d437e9b089eef349a01 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:34:25 +0000 Subject: [PATCH 180/242] changed last stream --- src/serinv/algs/pobts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 3d108daf..e7539abe 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -266,7 +266,7 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - d2h_stream.wait_event(compute_B_events[(n_diag_blocks) % 2]) + d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) From 5efb2885b028881824279f26e80c41a281891167 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:35:57 +0000 Subject: [PATCH 181/242] removed unnecessary events --- src/serinv/algs/pobtas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index b34fe185..0d09698c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -468,7 +468,6 @@ def _pobtas_streaming( # Events compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] - previous_B_events = [cp.cuda.Event(), 
cp.cuda.Event()] h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] @@ -525,7 +524,6 @@ def _pobtas_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - h2d_stream.wait_event(previous_B_events[(n_diag_blocks - 1) % 2]) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From 18943347fa62c362855fca497a489baba9640034 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:39:03 +0000 Subject: [PATCH 182/242] more parity --- src/serinv/algs/pobtas.py | 4 ++-- src/serinv/algs/pobts.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0d09698c..4f93df73 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -544,7 +544,7 @@ def _pobtas_streaming( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d[(i + 1) % 2] + @ B_previous_d[(i - 1) % 2] - L_lower_arrow_blocks_d[i % 2].conj().T @ B_arrow_tip_d, lower=True, trans="C", @@ -553,7 +553,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index e7539abe..2c04df7a 100644 --- 
a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -248,26 +248,23 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) - print(B) - print(B_previous_d) - print(i % 2) B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2] - @ B_previous_d[(i - 1) % 2], + @ B_previous_d[(i + 1) % 2], lower=True, ) compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i - 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) + B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": @@ -277,6 +274,7 @@ def _pobts_streaming( h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) with compute_stream: + compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) B_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], @@ -319,7 +317,7 @@ def _pobts_streaming( L_diagonal_blocks_d[i % 2], B_d[i % 2] - L_lower_diagonal_blocks_d[i % 2].conj().T - @ B_previous_d[(i + 1) % 2], + @ B_previous_d[(i - 1) % 2], lower=True, trans="C", ) @@ -327,7 +325,7 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i + 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], 
stream=d2h_stream, blocking=False) + B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: From 9db222d33fb8ae2b0c6fb327ec5bab7ab8f3a76c Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:40:21 +0000 Subject: [PATCH 183/242] more failsafes --- src/serinv/algs/pobtas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 4f93df73..0492802b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -516,7 +516,7 @@ def _pobtas_streaming( if n_diag_blocks > 1: - + h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream From db2928dc25f26d1b986107d09014949e7abc38e2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:46:42 +0000 Subject: [PATCH 184/242] cosmetic changes --- src/serinv/algs/pobtas.py | 56 ++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0492802b..556f267f 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -429,34 +429,46 @@ def _pobtas_streaming( # arrow tip block of the RHS. 
h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) + h2d_diagonal_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) + h2d_arrow_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_diagonal_events[(n_diag_blocks - 1) % 2]) - B_d[(n_diag_blocks - 1) % 2] = (cu_la.solve_triangular(L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2], lower=True,)) + B_d[(n_diag_blocks - 1) % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2], + lower=True + ) compute_partial_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[0]) - B_d[(n_diag_blocks - 1) % 2].get(out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], stream=d2h_stream, blocking=False,) + + B_d[(n_diag_blocks - 1) % 2].get( + out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], + stream=d2h_stream, + blocking=False + ) + d2h_B_events[0].record(stream=d2h_stream) with compute_stream: compute_stream.wait_event(h2d_arrow_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d -= (L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2] @ B_d[(n_diag_blocks - 1) % 2]) + B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) + compute_partial_events[1].record(stream=compute_stream) - compute_stream.wait_event(compute_partial_events[1]) - B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) - compute_partial_events[0].record(stream=compute_stream) + d2h_stream.wait_event(compute_partial_events[1]) - d2h_stream.wait_event(compute_partial_events[0]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) elif trans == 
"T" or trans == "C": @@ -476,7 +488,10 @@ def _pobtas_streaming( B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) - B_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + B_d[(n_diag_blocks - 1) % 2].set( + arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], + stream=h2d_stream + ) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) @@ -510,36 +525,50 @@ def _pobtas_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - B_d[(n_diag_blocks - 1) % 2].get(out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False,) + B_d[(n_diag_blocks - 1) % 2].get( + out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], + stream=d2h_stream, + blocking=False + + ) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) + B_d[n_diag_blocks % 2].set( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream ) + B_previous_d[(n_diag_blocks - 1) % 2].set( + arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], + stream=h2d_stream + ) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=h2d_stream) + 
h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) + B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -553,11 +582,18 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + + B_previous_d[(i - 1) % 2].get( + out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream=d2h_stream, + blocking=False + + ) d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[0]) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: From 47c9f5cffe2e04c19fe524024020de2d6f2c8955 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:54:49 +0000 Subject: [PATCH 185/242] more cosmetic changes --- src/serinv/algs/pobtas.py | 2 ++ src/serinv/algs/pobts.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 556f267f..f4d692c6 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -420,7 +420,9 @@ def 
_pobtas_streaming( compute_arrow_B_events[i % 2].record(stream=compute_stream) d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + d2h_tip_events[i % 2].record(stream=d2h_stream) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 2c04df7a..351838d6 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -188,6 +188,7 @@ def _pobts_streaming( B_previous_d = cp.empty( (2, *B_shape.shape), dtype=B_shape.dtype ) + del B_shape # L Buffers L_diagonal_blocks_d = cp.empty( @@ -210,6 +211,7 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[1]) + B_d[0] = ( cu_la.solve_triangular( L_diagonal_blocks_d[0], @@ -221,11 +223,14 @@ def _pobts_streaming( compute_B_events[0].record(stream=compute_stream) d2h_stream.wait_event(compute_B_events[0]) + B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) + d2h_events[0].record(stream=d2h_stream) if n_diag_blocks > 1: h2d_stream.wait_event(d2h_events[0]) + B_d[1].set( arr=B[diag_blocksize : (2 * diag_blocksize)], stream=h2d_stream @@ -233,6 +238,7 @@ def _pobts_streaming( L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) + h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks): @@ -240,14 +246,17 @@ def _pobts_streaming( if i + 1 < n_diag_blocks: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[(i + 1) 
% 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) + B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -259,11 +268,18 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i + 1) % 2].get(out=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=d2h_stream, blocking=False) + + B_previous_d[(i + 1) % 2].get( + out=B[(i - 1) * diag_blocksize : i * diag_blocksize], + stream=d2h_stream, + blocking=False + ) + d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) + B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) @@ -275,6 +291,7 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) + B_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], @@ -287,11 +304,14 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False,) + + B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) + B_d[n_diag_blocks % 2].set( arr=B[-(2 * diag_blocksize) : -diag_blocksize], stream=h2d_stream @@ -299,20 +319,24 @@ def _pobts_streaming( L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) + h2d_events[(n_diag_blocks - 1) % 
2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) + h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) + B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -325,11 +349,14 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) + B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) + d2h_events[i % 2].record(stream=d2h_stream) if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[0]) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: From 3ba9b9fc6bc720c629bb3b86509533b26bd3f921 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:55:58 +0000 Subject: [PATCH 186/242] attempt to reduce streaming --- src/serinv/algs/pobts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 351838d6..67fb2c95 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -292,7 +292,7 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) - B_d[(n_diag_blocks - 1) % 2] = ( + B_previous_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2], @@ -305,7 +305,7 @@ def _pobts_streaming( 
d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) @@ -318,7 +318,7 @@ def _pobts_streaming( ) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) + #B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From 3d9e334be789fb12f00989ed4cd8bc89b9a13de2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 15:59:26 +0000 Subject: [PATCH 187/242] reduced streaming --- src/serinv/algs/pobtas.py | 8 ++------ src/serinv/algs/pobts.py | 6 ++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f4d692c6..0744e7df 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -514,7 +514,7 @@ def _pobtas_streaming( trans="C", ) - B_d[(n_diag_blocks - 1) % 2] = ( + B_previous_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2] @@ -529,7 +529,7 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - B_d[(n_diag_blocks - 1) % 2].get( + B_previous_d[(n_diag_blocks - 1) % 2].get( out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], stream=d2h_stream, blocking=False @@ -545,10 +545,6 @@ def _pobtas_streaming( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], stream=h2d_stream 
) - B_previous_d[(n_diag_blocks - 1) % 2].set( - arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], - stream=h2d_stream - ) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 67fb2c95..30aac43c 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -212,7 +212,7 @@ def _pobts_streaming( with compute_stream: compute_stream.wait_event(h2d_events[1]) - B_d[0] = ( + B_previous_d[0] = ( cu_la.solve_triangular( L_diagonal_blocks_d[0], B_d[0], @@ -224,7 +224,7 @@ def _pobts_streaming( d2h_stream.wait_event(compute_B_events[0]) - B_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) d2h_events[0].record(stream=d2h_stream) @@ -237,7 +237,6 @@ def _pobts_streaming( ) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - B_previous_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) h2d_events[0].record(stream=h2d_stream) @@ -318,7 +317,6 @@ def _pobts_streaming( ) L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - #B_previous_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) From 8bf1908c9a95986bd34d2b35c878da629fcef7f4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:01:14 +0000 Subject: [PATCH 188/242] attempt to reduce streaming --- src/serinv/algs/pobts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 30aac43c..5627d6dd 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -302,9 +302,9 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) + #d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + #B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) @@ -352,7 +352,7 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) - if n_diag_blocks > 1: + if n_diag_blocks > 0: d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) From 25c9e56d885bd4f5d2e963194a43da651b204fdd Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:02:40 +0000 Subject: [PATCH 189/242] parity reduced streaming --- src/serinv/algs/pobts.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 5627d6dd..dd3459b2 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -222,9 +222,9 @@ def _pobts_streaming( compute_B_events[0].record(stream=compute_stream) - d2h_stream.wait_event(compute_B_events[0]) + #d2h_stream.wait_event(compute_B_events[0]) - B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) + #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) d2h_events[0].record(stream=d2h_stream) @@ -276,10 +276,9 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) - if n_diag_blocks > 1: - d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) + 
d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) - B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) + B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": @@ -302,10 +301,6 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - #d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - - #B_previous_d[(n_diag_blocks - 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) - d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: @@ -352,10 +347,10 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) - if n_diag_blocks > 0: - d2h_stream.wait_event(compute_B_events[0]) + + d2h_stream.wait_event(compute_B_events[0]) - B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") From 83a681cf35c16699e9a1e6067657b81392e6413c Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:04:46 +0000 Subject: [PATCH 190/242] attempt to fuirther reduce streaming --- src/serinv/algs/pobtas.py | 17 ++++++++--------- src/serinv/algs/pobts.py | 4 ---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 0744e7df..16a54ee7 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -529,12 +529,11 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - B_previous_d[(n_diag_blocks - 1) % 2].get( - out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], - stream=d2h_stream, - blocking=False - - ) + #B_previous_d[(n_diag_blocks - 1) % 2].get( + # 
out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], + # stream=d2h_stream, + # blocking=False + #) d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) @@ -589,10 +588,10 @@ def _pobtas_streaming( ) d2h_events[i % 2].record(stream=d2h_stream) - if n_diag_blocks > 1: - d2h_stream.wait_event(compute_B_events[0]) + #if n_diag_blocks > 1: + d2h_stream.wait_event(compute_B_events[0]) - B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index dd3459b2..2f216173 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -221,10 +221,6 @@ def _pobts_streaming( ) compute_B_events[0].record(stream=compute_stream) - - #d2h_stream.wait_event(compute_B_events[0]) - - #B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False,) d2h_events[0].record(stream=d2h_stream) From a685906a1e3c1a451b9f27d8dc2bd235bfe826c2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:09:31 +0000 Subject: [PATCH 191/242] speed up setup attempt --- src/serinv/algs/pobtas.py | 7 +------ src/serinv/algs/pobts.py | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 16a54ee7..2e69178d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -529,11 +529,7 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - #B_previous_d[(n_diag_blocks - 1) % 2].get( - # out=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], - # stream=d2h_stream, - # blocking=False - #) + d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) @@ -588,7 +584,6 @@ def _pobtas_streaming( ) d2h_events[i % 
2].record(stream=d2h_stream) - #if n_diag_blocks > 1: d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 2f216173..3278f94f 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -297,10 +297,10 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + #d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) + #h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-(2 * diag_blocksize) : -diag_blocksize], From 37118fd9a319858caf0e6a0d6ea694f5a59366fa Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:11:06 +0000 Subject: [PATCH 192/242] expand delay reduction --- src/serinv/algs/pobtas.py | 4 ++-- src/serinv/algs/pobts.py | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 2e69178d..6798e6f4 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -530,11 +530,11 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) + #d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) if n_diag_blocks > 1: - h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) + #h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 3278f94f..9b07c435 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -221,11 +221,8 @@ def _pobts_streaming( ) 
compute_B_events[0].record(stream=compute_stream) - - d2h_events[0].record(stream=d2h_stream) if n_diag_blocks > 1: - h2d_stream.wait_event(d2h_events[0]) B_d[1].set( arr=B[diag_blocksize : (2 * diag_blocksize)], @@ -297,10 +294,7 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - #d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) - if n_diag_blocks > 1: - #h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-(2 * diag_blocksize) : -diag_blocksize], From 73996a28263b1a32199e185e14bedae1652d57e5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:35:09 +0000 Subject: [PATCH 193/242] comment changes --- src/serinv/algs/pobtas.py | 18 ++++++------------ src/serinv/algs/pobts.py | 7 +++++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 6798e6f4..a457528c 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -296,7 +296,8 @@ def _pobtas_streaming( compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] - # Forward Pass + # --- Forward substitution --- + # --- C: events + transfers --- compute_current_B_events[1].record(stream=compute_stream) compute_next_B_events[1].record(stream=compute_stream) @@ -325,8 +326,7 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_lower_diagonal_events[0].record(stream=h2d_stream) - - # --- Forward substitution --- + # --- Computations --- for i in range(0, n_diag_blocks - 1): if i + 1 < n_diag_blocks: @@ -487,7 +487,6 @@ def _pobtas_streaming( # Forward Pass # --- C: events + transfers --- - B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) B_d[(n_diag_blocks - 1) % 2].set( @@ -499,9 +498,6 @@ def _pobtas_streaming( h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) - - - # 
----- Backward substitution ----- if not partial: # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) @@ -530,11 +526,8 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - #d2h_events[(n_diag_blocks - 1) % 2].record(stream=d2h_stream) - if n_diag_blocks > 1: - #h2d_stream.wait_event(d2h_events[(n_diag_blocks - 1) % 2]) B_d[n_diag_blocks % 2].set( arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], @@ -547,7 +540,7 @@ def _pobtas_streaming( h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) @@ -561,7 +554,8 @@ def _pobtas_streaming( with compute_stream: compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) - + + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 9b07c435..0420841c 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -222,6 +222,7 @@ def _pobts_streaming( compute_B_events[0].record(stream=compute_stream) + if n_diag_blocks > 1: B_d[1].set( @@ -234,7 +235,7 @@ def _pobts_streaming( h2d_events[0].record(stream=h2d_stream) for i in range(1, n_diag_blocks): - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i + 1 < n_diag_blocks: h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) @@ -249,6 +250,7 @@ def _pobts_streaming( compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ 
-306,7 +308,7 @@ def _pobts_streaming( h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} + if i > 0: h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) @@ -320,6 +322,7 @@ def _pobts_streaming( compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] From 9d3dda0ebddb6df88cc9b71609945b83c33bd5e9 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:42:26 +0000 Subject: [PATCH 194/242] check for useless if --- src/serinv/algs/pobtas.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a457528c..355aeae2 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -277,6 +277,7 @@ def _pobtas_streaming( L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) if trans == "N": + # --- Forward substitution --- # delete helper variable del B_shape @@ -296,8 +297,6 @@ def _pobtas_streaming( compute_partial_events = [cp.cuda.Event(), cp.cuda.Event()] - # --- Forward substitution --- - # --- C: events + transfers --- compute_current_B_events[1].record(stream=compute_stream) compute_next_B_events[1].record(stream=compute_stream) @@ -329,16 +328,15 @@ def _pobtas_streaming( # --- Computations --- for i in range(0, n_diag_blocks - 1): - if i + 1 < n_diag_blocks: - # stream next B block - h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - - B_d[(i + 1) % 2].set( - arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], - stream = h2d_stream - ) + #if i + 1 < n_diag_blocks: + # stream next B block + h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set( + arr=B[(i + 1) * 
diag_blocksize : (i + 2) * diag_blocksize], + stream = h2d_stream + ) - h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) if i + 1 < n_diag_blocks - 1: # stream next diagonal block From c2427fe4df9f7c9c5f0aaff2c52275a5fa20f571 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:47:01 +0000 Subject: [PATCH 195/242] check for duplicate --- src/serinv/algs/pobtas.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 355aeae2..2056fc39 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -327,10 +327,9 @@ def _pobtas_streaming( # --- Computations --- for i in range(0, n_diag_blocks - 1): - - #if i + 1 < n_diag_blocks: - # stream next B block + # pass next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) + B_d[(i + 1) % 2].set( arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream @@ -339,7 +338,7 @@ def _pobtas_streaming( h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) if i + 1 < n_diag_blocks - 1: - # stream next diagonal block + # pass next diagonal block h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) L_diagonal_blocks_d[(i + 1) % 2].set( @@ -351,7 +350,7 @@ def _pobtas_streaming( with compute_stream: - # Compute step 1 : compute B + # Solve current B compute_stream.wait_event(h2d_diagonal_events[i % 2]) B_d[i % 2] = cu_la.solve_triangular( @@ -362,7 +361,7 @@ def _pobtas_streaming( compute_current_B_events[i % 2].record(stream=compute_stream) - # stream B back + # Pass current B back d2h_stream.wait_event(compute_current_B_events[i % 2]) B_d[i % 2].get( @@ -374,7 +373,7 @@ def _pobtas_streaming( d2h_B_events[i % 2].record(stream=d2h_stream) if i + 1 < n_diag_blocks - 1: - # stream next lower diagonal block + # Pass next lower diagonal block h2d_stream.wait_event(compute_next_B_events[(i + 1) % 2]) 
L_lower_diagonal_blocks_d[(i + 1) % 2].set( @@ -385,7 +384,7 @@ def _pobtas_streaming( h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: - # Compute step 2 : update next B + # Update next B compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) B_d[(i + 1) % 2] -= ( @@ -396,7 +395,7 @@ def _pobtas_streaming( compute_next_B_events[i % 2].record(stream=compute_stream) if i + 1 < n_diag_blocks - 1: - # stream next lower arrow block + # Pass next lower arrow block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) L_lower_arrow_blocks_d[(i + 1) % 2].set( @@ -407,7 +406,7 @@ def _pobtas_streaming( h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: - # Compute step 3 : update arrow tip + # Update arrow tip compute_stream.wait_event(h2d_arrow_events[i % 2]) B_arrow_tip_d -= ( @@ -417,11 +416,12 @@ def _pobtas_streaming( compute_arrow_B_events[i % 2].record(stream=compute_stream) - d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + # Pass arrow tip back + d2h_stream.wait_event(compute_arrow_B_events[i % 2]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_tip_events[i % 2].record(stream=d2h_stream) + d2h_tip_events[i % 2].record(stream=d2h_stream) if not partial: From 2f159bfb31a33a92f65cbb6852cef4ca7c6566d4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 16:48:11 +0000 Subject: [PATCH 196/242] reverted --- src/serinv/algs/pobtas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 2056fc39..d85e140b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -417,11 +417,11 @@ def _pobtas_streaming( compute_arrow_B_events[i % 2].record(stream=compute_stream) # Pass arrow tip back - d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + 
d2h_stream.wait_event(compute_arrow_B_events[i % 2]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_tip_events[i % 2].record(stream=d2h_stream) + d2h_tip_events[i % 2].record(stream=d2h_stream) if not partial: From d710e38cc73e20af98c297ebe754a0d16682e827 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 17:29:33 +0000 Subject: [PATCH 197/242] reduced for loop --- src/serinv/algs/pobtas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index d85e140b..53311f06 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -417,11 +417,11 @@ def _pobtas_streaming( compute_arrow_B_events[i % 2].record(stream=compute_stream) # Pass arrow tip back - d2h_stream.wait_event(compute_arrow_B_events[i % 2]) + d2h_stream.wait_event(compute_arrow_B_events[n_diag_blocks % 2]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_tip_events[i % 2].record(stream=d2h_stream) + d2h_tip_events[n_diag_blocks % 2].record(stream=d2h_stream) if not partial: From e3dc9d36ef93aa9c946fff57b9012c91668ac614 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 18:38:07 +0000 Subject: [PATCH 198/242] reordered streaming --- src/serinv/algs/pobtas.py | 39 +++++++++++++++++++++++++-------------- src/serinv/algs/pobts.py | 27 +++++++++++++++++---------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 53311f06..f5f52976 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -277,9 +277,9 @@ def _pobtas_streaming( L_arrow_tip_block_d = cp.empty_like(L_arrow_tip_block) if trans == "N": - # --- Forward substitution --- + # ----- Forward substitution ----- - # delete 
helper variable + # Delete helper variable del B_shape # Events @@ -350,7 +350,7 @@ def _pobtas_streaming( with compute_stream: - # Solve current B + # Solve current B block compute_stream.wait_event(h2d_diagonal_events[i % 2]) B_d[i % 2] = cu_la.solve_triangular( @@ -361,7 +361,7 @@ def _pobtas_streaming( compute_current_B_events[i % 2].record(stream=compute_stream) - # Pass current B back + # Pass current B block back d2h_stream.wait_event(compute_current_B_events[i % 2]) B_d[i % 2].get( @@ -384,7 +384,7 @@ def _pobtas_streaming( h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: - # Update next B + # Update next B block compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) B_d[(i + 1) % 2] -= ( @@ -416,7 +416,7 @@ def _pobtas_streaming( compute_arrow_B_events[i % 2].record(stream=compute_stream) - # Pass arrow tip back + # Pass arrow tip back d2h_stream.wait_event(compute_arrow_B_events[n_diag_blocks % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) @@ -425,9 +425,7 @@ def _pobtas_streaming( if not partial: - # In the case of the partial solve, we do not solve the last block and - # arrow tip block of the RHS. 
- + # Pass last blocks h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) @@ -440,15 +438,18 @@ def _pobtas_streaming( with compute_stream: - + # Solve last B block compute_stream.wait_event(h2d_diagonal_events[(n_diag_blocks - 1) % 2]) + B_d[(n_diag_blocks - 1) % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], B_d[(n_diag_blocks - 1) % 2], lower=True ) + compute_partial_events[0].record(stream=compute_stream) + # Pass last B block back d2h_stream.wait_event(compute_partial_events[0]) B_d[(n_diag_blocks - 1) % 2].get( @@ -460,6 +461,7 @@ def _pobtas_streaming( d2h_B_events[0].record(stream=d2h_stream) with compute_stream: + # Solve arrow tip compute_stream.wait_event(h2d_arrow_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d -= (L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2] @ B_d[(n_diag_blocks - 1) % 2]) @@ -472,10 +474,14 @@ def _pobtas_streaming( B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) elif trans == "T" or trans == "C": + # ----- Backward substitution ----- + # Buffers B_previous_d = cp.empty( (2, *B_shape.shape), dtype=B_shape.dtype ) + + # Delete helper variable del B_shape # Events @@ -483,8 +489,7 @@ def _pobtas_streaming( h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] - # Forward Pass - # --- C: events + transfers --- + # --- H2D: transfers --- B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) B_d[(n_diag_blocks - 1) % 2].set( @@ -498,8 +503,9 @@ def _pobtas_streaming( # ----- Backward substitution ----- if not partial: - # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) + with compute_stream: + # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d = cu_la.solve_triangular( 
L_arrow_tip_block_d, @@ -508,6 +514,7 @@ def _pobtas_streaming( trans="C", ) + # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) B_previous_d[(n_diag_blocks - 1) % 2] = ( cu_la.solve_triangular( L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], @@ -520,6 +527,7 @@ def _pobtas_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) + # Pass arrow tip back d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) @@ -540,6 +548,7 @@ def _pobtas_streaming( for i in range(n_diag_blocks - 2, -1, -1): if i > 0: + # Pass new blocks h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) @@ -550,10 +559,10 @@ def _pobtas_streaming( h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -566,6 +575,7 @@ def _pobtas_streaming( compute_B_events[i % 2].record(compute_stream) + # Pass previous B block back d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d[(i - 1) % 2].get( @@ -576,6 +586,7 @@ def _pobtas_streaming( ) d2h_events[i % 2].record(stream=d2h_stream) + # Pass last B block back d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 0420841c..a532f892 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -204,12 +204,27 @@ def _pobts_streaming( d2h_events = [cp.cuda.Event(), cp.cuda.Event()] if trans == "N": + # ----- Forward 
substitution ----- + + # --- H2D: transfers --- B_d[0].set(arr=B[:diag_blocksize], stream=h2d_stream) L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) h2d_events[1].record(stream=h2d_stream) + if n_diag_blocks > 1: + + B_d[1].set( + arr=B[diag_blocksize : (2 * diag_blocksize)], + stream=h2d_stream + ) + L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) + L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + + h2d_events[0].record(stream=h2d_stream) + with compute_stream: + # Solve first B block compute_stream.wait_event(h2d_events[1]) B_previous_d[0] = ( @@ -223,21 +238,13 @@ def _pobts_streaming( compute_B_events[0].record(stream=compute_stream) - if n_diag_blocks > 1: - - B_d[1].set( - arr=B[diag_blocksize : (2 * diag_blocksize)], - stream=h2d_stream - ) - L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) - - h2d_events[0].record(stream=h2d_stream) + for i in range(1, n_diag_blocks): if i + 1 < n_diag_blocks: + # Pass next blocks h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) From b8871ccb81e340989d2cd4fc54c6b5884f58a364 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 1 May 2025 18:44:35 +0000 Subject: [PATCH 199/242] moved streaming and added documentation --- src/serinv/algs/pobts.py | 41 ++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index a532f892..295be756 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -213,7 +213,6 @@ def _pobts_streaming( h2d_events[1].record(stream=h2d_stream) if n_diag_blocks > 1: - B_d[1].set( arr=B[diag_blocksize : (2 * diag_blocksize)], stream=h2d_stream @@ -237,12 +236,8 @@ def _pobts_streaming( 
compute_B_events[0].record(stream=compute_stream) - - - for i in range(1, n_diag_blocks): - if i + 1 < n_diag_blocks: # Pass next blocks h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) @@ -254,10 +249,10 @@ def _pobts_streaming( h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i + 1) % 2]) compute_stream.wait_event(d2h_events[(i + 1) % 2]) - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -268,6 +263,7 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) + # Pass previous B block back d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d[(i + 1) % 2].get( @@ -278,18 +274,34 @@ def _pobts_streaming( d2h_events[i % 2].record(stream=d2h_stream) + # Pass last B block back d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) elif trans == "T" or trans == "C": + # ----- Backward substitution ----- + + # --- H2D: transfers --- B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) + if n_diag_blocks > 1: + + B_d[n_diag_blocks % 2].set( + arr=B[-(2 * diag_blocksize) : -diag_blocksize], + stream=h2d_stream + ) + L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) + + h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) + with compute_stream: + # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) 
compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) B_previous_d[(n_diag_blocks - 1) % 2] = ( @@ -303,20 +315,12 @@ def _pobts_streaming( compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - if n_diag_blocks > 1: - - B_d[n_diag_blocks % 2].set( - arr=B[-(2 * diag_blocksize) : -diag_blocksize], - stream=h2d_stream - ) - L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) - L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - - h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) + for i in range(n_diag_blocks - 2, -1, -1): if i > 0: + # pass next blocks h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) @@ -326,10 +330,10 @@ def _pobts_streaming( h2d_events[i % 2].record(stream=h2d_stream) with compute_stream: + # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) - # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -341,13 +345,14 @@ def _pobts_streaming( compute_B_events[i % 2].record(compute_stream) + # Pass previous B block back d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) d2h_events[i % 2].record(stream=d2h_stream) - + # Pass last B block back d2h_stream.wait_event(compute_B_events[0]) B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) From 69c20abb211c9d1ef9be20d4166fb41b94dc07bd Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 12:59:42 +0000 Subject: [PATCH 200/242] bigger tests --- tests/tests_algs/regular/conftest.py | 3 +++ 
tests/tests_algs/regular/tests_bt/test_pobts.py | 2 +- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/tests_algs/regular/conftest.py b/tests/tests_algs/regular/conftest.py index 239baed2..db19b997 100644 --- a/tests/tests_algs/regular/conftest.py +++ b/tests/tests_algs/regular/conftest.py @@ -9,6 +9,9 @@ pytest.param(2, id="n_diag_blocks=2"), pytest.param(3, id="n_diag_blocks=3"), pytest.param(4, id="n_diag_blocks=4"), + pytest.param(4, id="n_diag_blocks=125"), + pytest.param(4, id="n_diag_blocks=500"), + pytest.param(4, id="n_diag_blocks=1000"), ] diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index f5c941dc..b8fb5ad3 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -12,7 +12,7 @@ import cupyx as cpx @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000]) def test_pobts( n_rhs: int, diagonal_blocksize: int, diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index e9ce2384..1c4a6c22 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From f2570605711b8f1d343652ffd497c2098a63852b Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 13:02:18 +0000 Subject: [PATCH 201/242] even bigger tests --- tests/tests_algs/regular/tests_bt/test_pobts.py | 2 +- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py 
b/tests/tests_algs/regular/tests_bt/test_pobts.py index b8fb5ad3..58168ab3 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -12,7 +12,7 @@ import cupyx as cpx @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000, 8000, 16000]) def test_pobts( n_rhs: int, diagonal_blocksize: int, diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 1c4a6c22..763a2679 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000, 8000, 16000]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 0416d8abba80ad7b742e96c7ec0b4fd8b3bdc992 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 13:04:59 +0000 Subject: [PATCH 202/242] even more bigger tests --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 3e624933..f8ec3b56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,9 @@ DIAGONAL_BLOCKSIZE = [ pytest.param(2, id="diagonal_blocksize=2"), pytest.param(3, id="diagonal_blocksize=3"), + pytest.param(500, id="diagonal_blocksize=500"), + pytest.param(500, id="diagonal_blocksize=1000"), + pytest.param(500, id="diagonal_blocksize=4000"), ] From b00d95b8e39b752872bf26d9fb4ddd80ca19b29b Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 13:18:54 +0000 Subject: [PATCH 203/242] changed tests to be smaller --- tests/conftest.py | 2 -- tests/tests_algs/regular/conftest.py | 4 +--- tests/tests_algs/regular/tests_bt/test_pobts.py | 2 +- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff 
--git a/tests/conftest.py b/tests/conftest.py index f8ec3b56..ac1a938a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,8 +25,6 @@ pytest.param(2, id="diagonal_blocksize=2"), pytest.param(3, id="diagonal_blocksize=3"), pytest.param(500, id="diagonal_blocksize=500"), - pytest.param(500, id="diagonal_blocksize=1000"), - pytest.param(500, id="diagonal_blocksize=4000"), ] diff --git a/tests/tests_algs/regular/conftest.py b/tests/tests_algs/regular/conftest.py index db19b997..0d22276e 100644 --- a/tests/tests_algs/regular/conftest.py +++ b/tests/tests_algs/regular/conftest.py @@ -9,9 +9,7 @@ pytest.param(2, id="n_diag_blocks=2"), pytest.param(3, id="n_diag_blocks=3"), pytest.param(4, id="n_diag_blocks=4"), - pytest.param(4, id="n_diag_blocks=125"), - pytest.param(4, id="n_diag_blocks=500"), - pytest.param(4, id="n_diag_blocks=1000"), + pytest.param(125, id="n_diag_blocks=125"), ] diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index 58168ab3..0cff67fe 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -12,7 +12,7 @@ import cupyx as cpx @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000, 8000, 16000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000]) def test_pobts( n_rhs: int, diagonal_blocksize: int, diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 763a2679..288af233 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000, 4000, 8000, 16000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 6dc83e6d58e3aaae7ba5ad12505b9be1cc2e9492 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 
13:21:29 +0000 Subject: [PATCH 204/242] smaller tests again --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index ac1a938a..5f6d4827 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ DIAGONAL_BLOCKSIZE = [ pytest.param(2, id="diagonal_blocksize=2"), pytest.param(3, id="diagonal_blocksize=3"), - pytest.param(500, id="diagonal_blocksize=500"), + pytest.param(20, id="diagonal_blocksize=20"), ] From 2645a8fa5484142cde799953d284c6e4b7d5d2d4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 2 May 2025 13:23:25 +0000 Subject: [PATCH 205/242] reset tests --- tests/conftest.py | 1 - tests/tests_algs/regular/conftest.py | 1 - tests/tests_algs/regular/tests_bt/test_pobts.py | 2 +- tests/tests_algs/regular/tests_bta/test_pobtas.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5f6d4827..3e624933 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,6 @@ DIAGONAL_BLOCKSIZE = [ pytest.param(2, id="diagonal_blocksize=2"), pytest.param(3, id="diagonal_blocksize=3"), - pytest.param(20, id="diagonal_blocksize=20"), ] diff --git a/tests/tests_algs/regular/conftest.py b/tests/tests_algs/regular/conftest.py index 0d22276e..239baed2 100644 --- a/tests/tests_algs/regular/conftest.py +++ b/tests/tests_algs/regular/conftest.py @@ -9,7 +9,6 @@ pytest.param(2, id="n_diag_blocks=2"), pytest.param(3, id="n_diag_blocks=3"), pytest.param(4, id="n_diag_blocks=4"), - pytest.param(125, id="n_diag_blocks=125"), ] diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index 0cff67fe..f5c941dc 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -12,7 +12,7 @@ import cupyx as cpx @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3]) 
def test_pobts( n_rhs: int, diagonal_blocksize: int, diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 288af233..e9ce2384 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -13,7 +13,7 @@ @pytest.mark.mpi_skip() -@pytest.mark.parametrize("n_rhs", [1, 2, 3, 500, 2000]) +@pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobtas( n_rhs: int, diagonal_blocksize: int, From 7329ec3ada7580fb1a599e13a4c7d74c2d1ecfd6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 8 May 2025 13:59:55 +0200 Subject: [PATCH 206/242] add scripts for cscs --- run_streamlined_sequential_pobtax_gpu.sh | 51 +++++ ...d_sequential_pobtax_gpu.sh:Zone.Identifier | 3 + .../streamlined_sequential_pobtax_gpu.py | 195 ++++++++++++++++++ ...d_sequential_pobtax_gpu.py:Zone.Identifier | 3 + 4 files changed, 252 insertions(+) create mode 100644 run_streamlined_sequential_pobtax_gpu.sh create mode 100644 sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier create mode 100644 sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py create mode 100644 sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier diff --git a/run_streamlined_sequential_pobtax_gpu.sh b/run_streamlined_sequential_pobtax_gpu.sh new file mode 100644 index 00000000..fe03f6bf --- /dev/null +++ b/run_streamlined_sequential_pobtax_gpu.sh @@ -0,0 +1,51 @@ +#!/bin/bash -l +#SBATCH --job-name="serinv_pobtx_benchmark" +#SBATCH --output=%x.%j.out +#SBATCH --error=%x.%j.err +#SBATCH --account=lp82 +#SBATCH --time=00:10:00 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=64 +#SBATCH --gpus-per-task=1 +#SBATCH --partition=debug +#SBATCH --constraint=gpu +#SBATCH --hint=nomultithread +#SBATCH --uenv=prgenv-gnu/24.11:v1 +#SBATCH --view=modules + +set -e -u + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export 
MPICH_GPU_SUPPORT_ENABLED=1 +export OMP_PLACES=cores +export OMP_PROC_BIND=close + +export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID + +source ~/load_modules.sh +conda activate serinv_env + +# Dataset 1: b = 1675, a = 6, n = 128 +# Reference timings (to beat!): +# - pobtaf: 0.38959 +# - pobtas: 0.02415 +# - pobtasi: 0.29593 +# export b=1675 +# export a=6 +# export n=128 + +# Dataset 1: b = 4002, a = 6, n = 250 +# Reference timings (to beat!): +# - pobtaf: 3.2716 (INLA_BTA CUDA code: 2.713) +# - pobtas: 0.15397 +# - pobtasi: 5.15729 +export b=4002 +export a=6 +export n=250 + +# Benchmark the code +srun python ~/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n + +# Profile the code +# srun nsys profile --force-overwrite=true -o profile_serinv_pobtax_b${b}_a${a}_n${n} python ~/repositories/serinv/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n --b $b --a $a --n $n \ No newline at end of file diff --git a/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier b/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier new file mode 100644 index 00000000..33e02d64 --- /dev/null +++ b/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://iis-mattermost.ee.ethz.ch/api/v4/files/waiggpk1miyeb84dcahdh53b1e?download=1 diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py new file mode 100644 index 00000000..88f549cd --- /dev/null +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -0,0 +1,195 @@ +import time + +tic = time.perf_counter() +import argparse + +import numpy as np +import cupy as cp +from cupy.cuda.nvtx import RangePush, RangePop + +from serinv.algs import pobtaf, pobtas, pobtasi + + +def sequential_dataset( + n_blocks: int, + 
diagonal_blocksize: int, + arrowhead_blocksize: int, +): + A_diagonal_blocks = np.random.rand(n_blocks, diagonal_blocksize, diagonal_blocksize) + A_lower_diagonal_blocks = np.random.rand( + n_blocks - 1, diagonal_blocksize, diagonal_blocksize + ) + A_arrow_bottom_blocks = np.random.rand( + n_blocks, arrowhead_blocksize, diagonal_blocksize + ) + A_arrow_tip_block = np.random.rand(arrowhead_blocksize, arrowhead_blocksize) + + # CODE TO MODIFY + arrow_colsum = np.zeros((arrowhead_blocksize), dtype=A_diagonal_blocks.dtype) + for i in range(A_diagonal_blocks.shape[0]): + colsum = np.sum(A_diagonal_blocks[i, :, :], axis=1) - np.diag( + A_diagonal_blocks[i, :, :] + ) + if i > 0: + colsum += np.sum(A_lower_diagonal_blocks[i - 1, :, :], axis=1) + + A_diagonal_blocks[i, :, :] += np.diag(colsum) + + arrow_colsum[:] += np.sum(A_arrow_bottom_blocks[i, :, :], axis=1) + + A_arrow_tip_block[:, :] += np.diag( + arrow_colsum + np.sum(A_arrow_tip_block[:, :], axis=1) + ) + + return ( + A_diagonal_blocks, + A_lower_diagonal_blocks, + A_arrow_bottom_blocks, + A_arrow_tip_block, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process some integers.") + parser.add_argument( + "--b", + type=int, + default=128, + help="an integer for the diagonal block size", + ) + parser.add_argument( + "--a", + type=int, + default=0, + help="an integer for the diagonal block size", + ) + parser.add_argument( + "--n", + type=int, + default=8, + help="an integer for the number of diagonal blocks", + ) + args = parser.parse_args() + toc = time.perf_counter() + print(f"Import and parsing took: {toc - tic:.5f} sec", flush=True) + + diagonal_blocksize = args.b + arrowhead_blocksize = args.a + n_blocks = args.n + n_iterations = 10 + n_warmups = 2 + + tic = time.perf_counter() + ( + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, + ) = sequential_dataset( + n_blocks, + diagonal_blocksize, + arrowhead_blocksize, + ) + 
B_cpu = np.random.rand(diagonal_blocksize * n_blocks + arrowhead_blocksize, 1) + toc = time.perf_counter() + print(f"Generate dataset took: {toc - tic:.5f} sec", flush=True) + print(f" b = {diagonal_blocksize}", flush=True) + print(f" a = {arrowhead_blocksize}", flush=True) + print(f" n = {n_blocks}", flush=True) + print(f" n_iterations = {n_iterations}", flush=True) + print(f" n_warmups = {n_warmups}", flush=True) + + total_memory = ( + A_diagonal_blocks_cpu.nbytes + + A_lower_diagonal_blocks_cpu.nbytes + + A_arrow_bottom_blocks_cpu.nbytes + + A_arrow_tip_block_cpu.nbytes + + B_cpu.nbytes + ) + print(f" Total memory: {total_memory / 1e9:.5f} GB", flush=True) + + tic = time.perf_counter() + # Init device arrays + A_diagonal_blocks_gpu = cp.empty_like(A_diagonal_blocks_cpu) + A_lower_diagonal_blocks_gpu = cp.empty_like(A_lower_diagonal_blocks_cpu) + A_arrow_bottom_blocks_gpu = cp.empty_like(A_arrow_bottom_blocks_cpu) + A_arrow_tip_block_gpu = cp.empty_like(A_arrow_tip_block_cpu) + B_gpu = cp.empty_like(B_cpu) + toc = time.perf_counter() + print(f"Init device arrays took: {toc - tic:.5f} sec", flush=True) + + t_pobtaf = [] + t_pobtas = [] + t_pobtasi = [] + + for i in range(n_warmups + n_iterations): + print(f"Iteration: {i+1}/{n_warmups+n_iterations}", flush=True) + + tic = time.perf_counter() + A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) + A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) + A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) + A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) + B_gpu.set(arr=B_cpu) + toc = time.perf_counter() + print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) + + cp.cuda.runtime.deviceSynchronize() + RangePush(f"pobtaf: i:{i}") + tic = time.perf_counter() + pobtaf( + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, + ) + cp.cuda.runtime.deviceSynchronize() + toc = time.perf_counter() + RangePop() + elapsed = toc - tic + 
print(f"pobtaf took: {elapsed:.5f} sec", flush=True) + if i >= n_warmups: + t_pobtaf.append(elapsed) + + cp.cuda.runtime.deviceSynchronize() + RangePush(f"pobtas: i:{i}") + tic = time.perf_counter() + pobtas( + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, + B_gpu, + ) + cp.cuda.runtime.deviceSynchronize() + toc = time.perf_counter() + RangePop() + elapsed = toc - tic + print(f"pobtas took: {elapsed:.5f} sec", flush=True) + if i >= n_warmups: + t_pobtas.append(elapsed) + + cp.cuda.runtime.deviceSynchronize() + RangePush(f"pobtasi: i:{i}") + tic = time.perf_counter() + pobtasi( + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, + ) + cp.cuda.runtime.deviceSynchronize() + toc = time.perf_counter() + RangePop() + elapsed = toc - tic + print(f"pobtasi took: {elapsed:.5f} sec", flush=True) + if i >= n_warmups: + t_pobtasi.append(elapsed) + + print(f"t_pobtaf: {t_pobtaf}", flush=True) + print(f"t_pobtas: {t_pobtas}", flush=True) + print(f"t_pobtasi: {t_pobtasi}", flush=True) + + print(f"avg t_pobtaf: {np.mean(np.array(t_pobtaf)):.5f} sec", flush=True) + print(f"avg t_pobtas: {np.mean(np.array(t_pobtas)):.5f} sec", flush=True) + print(f"avg t_pobtasi: {np.mean(np.array(t_pobtasi)):.5f} sec", flush=True) \ No newline at end of file diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier new file mode 100644 index 00000000..ce8dec59 --- /dev/null +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://iis-mattermost.ee.ethz.ch/api/v4/files/fw5m5tapefbi8deseto5qqro9w?download=1 From 04cbd7779b160e6031ee0613df0c8812fbce2e34 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 8 May 2025 14:34:20 +0000 Subject: [PATCH 207/242] updarte bash script --- 
run_streamlined_sequential_pobtax_gpu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_streamlined_sequential_pobtax_gpu.sh b/run_streamlined_sequential_pobtax_gpu.sh index fe03f6bf..5d13342c 100644 --- a/run_streamlined_sequential_pobtax_gpu.sh +++ b/run_streamlined_sequential_pobtax_gpu.sh @@ -2,7 +2,7 @@ #SBATCH --job-name="serinv_pobtx_benchmark" #SBATCH --output=%x.%j.out #SBATCH --error=%x.%j.err -#SBATCH --account=lp82 +#SBATCH --account=lp16 #SBATCH --time=00:10:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 @@ -35,7 +35,7 @@ conda activate serinv_env # export a=6 # export n=128 -# Dataset 1: b = 4002, a = 6, n = 250 +# Dataset 2: b = 4002, a = 6, n = 250 # Reference timings (to beat!): # - pobtaf: 3.2716 (INLA_BTA CUDA code: 2.713) # - pobtas: 0.15397 From 6a85bc5987127889e341a69bd7488382b365c002 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 9 May 2025 07:36:33 +0000 Subject: [PATCH 208/242] removed load_modules --- run_streamlined_sequential_pobtax_gpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_streamlined_sequential_pobtax_gpu.sh b/run_streamlined_sequential_pobtax_gpu.sh index 5d13342c..33d81986 100644 --- a/run_streamlined_sequential_pobtax_gpu.sh +++ b/run_streamlined_sequential_pobtax_gpu.sh @@ -23,7 +23,7 @@ export OMP_PROC_BIND=close export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID -source ~/load_modules.sh +# source ~/load_modules.sh conda activate serinv_env # Dataset 1: b = 1675, a = 6, n = 128 From d10b43692633929fccd304b65d17e643413d50e4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 9 May 2025 08:19:11 +0000 Subject: [PATCH 209/242] changed file path --- run_streamlined_sequential_pobtax_gpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_streamlined_sequential_pobtax_gpu.sh b/run_streamlined_sequential_pobtax_gpu.sh index 33d81986..74b5d9db 100644 --- a/run_streamlined_sequential_pobtax_gpu.sh +++ b/run_streamlined_sequential_pobtax_gpu.sh @@ -45,7 +45,7 @@ 
export a=6 export n=250 # Benchmark the code -srun python ~/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n +srun python ~/serinv/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n # Profile the code # srun nsys profile --force-overwrite=true -o profile_serinv_pobtax_b${b}_a${a}_n${n} python ~/repositories/serinv/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n --b $b --a $a --n $n \ No newline at end of file From 1a64890a42104fdaba7abff1721a75b11ee54f3b Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 9 May 2025 08:42:06 +0000 Subject: [PATCH 210/242] change to enable streaming on daint --- sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 88f549cd..a6f6be10 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -160,6 +160,7 @@ def sequential_dataset( A_arrow_bottom_blocks_gpu, A_arrow_tip_block_gpu, B_gpu, + device_streaming=True ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() From d21064f4b58258aba42bc8f4e7ddbaa9c83f0f6b Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 07:44:49 +0000 Subject: [PATCH 211/242] added check message --- src/serinv/algs/pobtas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index f5f52976..a6a528f6 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -238,6 +238,8 @@ def _pobtas_streaming( raise NotImplementedError( "Host<->Device streaming only works when host-arrays are given." 
) + + print("streaming") cp, cu_la = _get_module_from_str(module_str="cupy") From 86ce7c185ac035ef3f3ffd9e6e9a067788d8a57b Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 08:38:42 +0000 Subject: [PATCH 212/242] changed given arrays --- .../streamlined_sequential_pobtax_gpu.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index a6f6be10..9002afaf 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -138,10 +138,10 @@ def sequential_dataset( RangePush(f"pobtaf: i:{i}") tic = time.perf_counter() pobtaf( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() @@ -155,11 +155,11 @@ def sequential_dataset( RangePush(f"pobtas: i:{i}") tic = time.perf_counter() pobtas( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, - B_gpu, + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, + B_cpu, device_streaming=True ) cp.cuda.runtime.deviceSynchronize() @@ -174,10 +174,10 @@ def sequential_dataset( RangePush(f"pobtasi: i:{i}") tic = time.perf_counter() pobtasi( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() From 942a1465c33d9470f4a4a4539583f89f3f73ea18 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:00:23 +0000 
Subject: [PATCH 213/242] rolled back block choice for further testing --- .../streamlined_sequential_pobtax_gpu.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 9002afaf..8d575c17 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -138,10 +138,11 @@ def sequential_dataset( RangePush(f"pobtaf: i:{i}") tic = time.perf_counter() pobtaf( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, + device_streaming=True ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() @@ -155,11 +156,11 @@ def sequential_dataset( RangePush(f"pobtas: i:{i}") tic = time.perf_counter() pobtas( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, - B_cpu, + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, + B_gpu, device_streaming=True ) cp.cuda.runtime.deviceSynchronize() @@ -174,10 +175,10 @@ def sequential_dataset( RangePush(f"pobtasi: i:{i}") tic = time.perf_counter() pobtasi( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, + A_diagonal_blocks_gpu, + A_lower_diagonal_blocks_gpu, + A_arrow_bottom_blocks_gpu, + A_arrow_tip_block_gpu, ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() From dcc85fb0a85111472bdbd08aa0d36944fa8792b5 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:06:46 +0000 Subject: [PATCH 214/242] attempt to activate streaming --- .../streamlined_sequential_pobtax_gpu.py | 30 +++++++++++++++---- 1 file changed, 24 
insertions(+), 6 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 8d575c17..98206bd8 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -142,7 +142,7 @@ def sequential_dataset( A_lower_diagonal_blocks_gpu, A_arrow_bottom_blocks_gpu, A_arrow_tip_block_gpu, - device_streaming=True + # device_streaming=True ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() @@ -152,15 +152,24 @@ def sequential_dataset( if i >= n_warmups: t_pobtaf.append(elapsed) + tic = time.perf_counter() + A_diagonal_blocks_gpu.get(arr=A_diagonal_blocks_cpu) + A_lower_diagonal_blocks_gpu.get(arr=A_lower_diagonal_blocks_cpu) + A_arrow_bottom_blocks_gpu.get(arr=A_arrow_bottom_blocks_cpu) + A_arrow_tip_block_gpu.get(arr=A_arrow_tip_block_cpu) + B_gpu.get(arr=B_cpu) + toc = time.perf_counter() + print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) + cp.cuda.runtime.deviceSynchronize() RangePush(f"pobtas: i:{i}") tic = time.perf_counter() pobtas( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, - B_gpu, + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, + B_cpu, device_streaming=True ) cp.cuda.runtime.deviceSynchronize() @@ -171,6 +180,15 @@ def sequential_dataset( if i >= n_warmups: t_pobtas.append(elapsed) + tic = time.perf_counter() + A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) + A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) + A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) + A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) + B_gpu.set(arr=B_cpu) + toc = time.perf_counter() + print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) + cp.cuda.runtime.deviceSynchronize() 
RangePush(f"pobtasi: i:{i}") tic = time.perf_counter() From a73b7b900dfeaad694041a7a62be745366bebeb3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:09:32 +0000 Subject: [PATCH 215/242] typo --- .../streamlined_sequential_pobtax_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 98206bd8..c758bf0f 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -153,10 +153,10 @@ def sequential_dataset( t_pobtaf.append(elapsed) tic = time.perf_counter() - A_diagonal_blocks_gpu.get(arr=A_diagonal_blocks_cpu) - A_lower_diagonal_blocks_gpu.get(arr=A_lower_diagonal_blocks_cpu) - A_arrow_bottom_blocks_gpu.get(arr=A_arrow_bottom_blocks_cpu) - A_arrow_tip_block_gpu.get(arr=A_arrow_tip_block_cpu) + A_diagonal_blocks_gpu.get(out=A_diagonal_blocks_cpu) + A_lower_diagonal_blocks_gpu.get(out=A_lower_diagonal_blocks_cpu) + A_arrow_bottom_blocks_gpu.get(out=A_arrow_bottom_blocks_cpu) + A_arrow_tip_block_gpu.get(out=A_arrow_tip_block_cpu) B_gpu.get(arr=B_cpu) toc = time.perf_counter() print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) From 408628e717420f35090199cf45cbe5cf05bc8445 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:32:12 +0000 Subject: [PATCH 216/242] another typo --- .../positive_definite/streamlined_sequential_pobtax_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index c758bf0f..7e65e662 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -157,7 +157,7 @@ def sequential_dataset( 
A_lower_diagonal_blocks_gpu.get(out=A_lower_diagonal_blocks_cpu) A_arrow_bottom_blocks_gpu.get(out=A_arrow_bottom_blocks_cpu) A_arrow_tip_block_gpu.get(out=A_arrow_tip_block_cpu) - B_gpu.get(arr=B_cpu) + B_gpu.get(out=B_cpu) toc = time.perf_counter() print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) From 060fd0b1c6aa7c06e28b94e7ae8504f48207dec7 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:53:12 +0000 Subject: [PATCH 217/242] enable streaming for pobtaf --- .../streamlined_sequential_pobtax_gpu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 7e65e662..a35d6a2c 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -138,11 +138,11 @@ def sequential_dataset( RangePush(f"pobtaf: i:{i}") tic = time.perf_counter() pobtaf( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, - # device_streaming=True + A_diagonal_blocks_cpu, + A_lower_diagonal_blocks_cpu, + A_arrow_bottom_blocks_cpu, + A_arrow_tip_block_cpu, + device_streaming=True ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() From d08b7b3924811db11e432cc7209499efd4c740fc Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 09:54:05 +0000 Subject: [PATCH 218/242] removing copy --- .../streamlined_sequential_pobtax_gpu.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index a35d6a2c..1b879ed3 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -152,14 +152,14 @@ def 
sequential_dataset( if i >= n_warmups: t_pobtaf.append(elapsed) - tic = time.perf_counter() - A_diagonal_blocks_gpu.get(out=A_diagonal_blocks_cpu) - A_lower_diagonal_blocks_gpu.get(out=A_lower_diagonal_blocks_cpu) - A_arrow_bottom_blocks_gpu.get(out=A_arrow_bottom_blocks_cpu) - A_arrow_tip_block_gpu.get(out=A_arrow_tip_block_cpu) - B_gpu.get(out=B_cpu) - toc = time.perf_counter() - print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) + #tic = time.perf_counter() + #A_diagonal_blocks_gpu.get(out=A_diagonal_blocks_cpu) + #A_lower_diagonal_blocks_gpu.get(out=A_lower_diagonal_blocks_cpu) + #A_arrow_bottom_blocks_gpu.get(out=A_arrow_bottom_blocks_cpu) + #A_arrow_tip_block_gpu.get(out=A_arrow_tip_block_cpu) + #B_gpu.get(out=B_cpu) + #toc = time.perf_counter() + #print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) cp.cuda.runtime.deviceSynchronize() RangePush(f"pobtas: i:{i}") From ac4779950ad5886bce58f77174d252800994b1e6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 11:33:46 +0000 Subject: [PATCH 219/242] pinned memory --- .../streamlined_sequential_pobtax_gpu.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 1b879ed3..35d2ce5a 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -118,6 +118,23 @@ def sequential_dataset( toc = time.perf_counter() print(f"Init device arrays took: {toc - tic:.5f} sec", flush=True) + A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks_cpu) + A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks_cpu[:, :, :] + A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks_cpu) + A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks_cpu[:, :, :] + A_lower_arrow_blocks_pinned = 
cpx.zeros_like_pinned(A_lower_arrow_blocks_cpu) + A_lower_arrow_blocks_pinned[:, :, :] = A_lower_arrow_blocks_cpu[:, :, :] + A_arrow_tip_block_pinned = cpx.zeros_like_pinned(A_arrow_tip_block_cpu) + A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block_cpu[:, :] + B_pinned = cpx.zeros_like_pinned(B_cpu) + B_pinned[:, :] = B_cpu[:, :] + + A_diagonal_blocks_cpu = A_diagonal_blocks_pinned + A_lower_diagonal_blocks_cpu = A_lower_diagonal_blocks_pinned + A_lower_arrow_blocks_cpu = A_lower_arrow_blocks_pinned + A_arrow_tip_block_cpu = A_arrow_tip_block_pinned + B = B_pinned + t_pobtaf = [] t_pobtas = [] t_pobtasi = [] From 5faecf625929a7f0ed6071427c6e9b8de650fcf3 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 11:43:12 +0000 Subject: [PATCH 220/242] typo --- .../streamlined_sequential_pobtax_gpu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 35d2ce5a..49537dfa 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -118,15 +118,15 @@ def sequential_dataset( toc = time.perf_counter() print(f"Init device arrays took: {toc - tic:.5f} sec", flush=True) - A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks_cpu) + A_diagonal_blocks_pinned = cp.zeros_like_pinned(A_diagonal_blocks_cpu) A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks_cpu[:, :, :] - A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks_cpu) + A_lower_diagonal_blocks_pinned = cp.zeros_like_pinned(A_lower_diagonal_blocks_cpu) A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks_cpu[:, :, :] - A_lower_arrow_blocks_pinned = cpx.zeros_like_pinned(A_lower_arrow_blocks_cpu) + A_lower_arrow_blocks_pinned = cp.zeros_like_pinned(A_lower_arrow_blocks_cpu) A_lower_arrow_blocks_pinned[:, :, :] = 
A_lower_arrow_blocks_cpu[:, :, :] - A_arrow_tip_block_pinned = cpx.zeros_like_pinned(A_arrow_tip_block_cpu) + A_arrow_tip_block_pinned = cp.zeros_like_pinned(A_arrow_tip_block_cpu) A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block_cpu[:, :] - B_pinned = cpx.zeros_like_pinned(B_cpu) + B_pinned = cp.zeros_like_pinned(B_cpu) B_pinned[:, :] = B_cpu[:, :] A_diagonal_blocks_cpu = A_diagonal_blocks_pinned From e668184223285e83aee1d3d293ad8fc7dc25948c Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 11:44:16 +0000 Subject: [PATCH 221/242] changed block name --- .../positive_definite/streamlined_sequential_pobtax_gpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 49537dfa..5aef4518 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -122,8 +122,8 @@ def sequential_dataset( A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks_cpu[:, :, :] A_lower_diagonal_blocks_pinned = cp.zeros_like_pinned(A_lower_diagonal_blocks_cpu) A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks_cpu[:, :, :] - A_lower_arrow_blocks_pinned = cp.zeros_like_pinned(A_lower_arrow_blocks_cpu) - A_lower_arrow_blocks_pinned[:, :, :] = A_lower_arrow_blocks_cpu[:, :, :] + A_lower_arrow_blocks_pinned = cp.zeros_like_pinned(A_arrow_bottom_blocks_cpu) + A_lower_arrow_blocks_pinned[:, :, :] = A_arrow_bottom_blocks_cpu[:, :, :] A_arrow_tip_block_pinned = cp.zeros_like_pinned(A_arrow_tip_block_cpu) A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block_cpu[:, :] B_pinned = cp.zeros_like_pinned(B_cpu) @@ -131,7 +131,7 @@ def sequential_dataset( A_diagonal_blocks_cpu = A_diagonal_blocks_pinned A_lower_diagonal_blocks_cpu = A_lower_diagonal_blocks_pinned - A_lower_arrow_blocks_cpu = A_lower_arrow_blocks_pinned + 
A_arrow_bottom_blocks_cpu = A_lower_arrow_blocks_pinned A_arrow_tip_block_cpu = A_arrow_tip_block_pinned B = B_pinned From 2a048b04f20faf3c25429f2073c7aecdfc174953 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 11:47:35 +0000 Subject: [PATCH 222/242] import cupyx --- .../streamlined_sequential_pobtax_gpu.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 5aef4518..b5b1833c 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -6,6 +6,7 @@ import numpy as np import cupy as cp from cupy.cuda.nvtx import RangePush, RangePop +import cupyx as cpx from serinv.algs import pobtaf, pobtas, pobtasi @@ -118,15 +119,15 @@ def sequential_dataset( toc = time.perf_counter() print(f"Init device arrays took: {toc - tic:.5f} sec", flush=True) - A_diagonal_blocks_pinned = cp.zeros_like_pinned(A_diagonal_blocks_cpu) + A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks_cpu) A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks_cpu[:, :, :] - A_lower_diagonal_blocks_pinned = cp.zeros_like_pinned(A_lower_diagonal_blocks_cpu) + A_lower_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_lower_diagonal_blocks_cpu) A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks_cpu[:, :, :] - A_lower_arrow_blocks_pinned = cp.zeros_like_pinned(A_arrow_bottom_blocks_cpu) + A_lower_arrow_blocks_pinned = cpx.zeros_like_pinned(A_arrow_bottom_blocks_cpu) A_lower_arrow_blocks_pinned[:, :, :] = A_arrow_bottom_blocks_cpu[:, :, :] - A_arrow_tip_block_pinned = cp.zeros_like_pinned(A_arrow_tip_block_cpu) + A_arrow_tip_block_pinned = cpx.zeros_like_pinned(A_arrow_tip_block_cpu) A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block_cpu[:, :] - B_pinned = cp.zeros_like_pinned(B_cpu) + B_pinned = 
cpx.zeros_like_pinned(B_cpu) B_pinned[:, :] = B_cpu[:, :] A_diagonal_blocks_cpu = A_diagonal_blocks_pinned From 5bc3bcbe62e3b08c6aec6163fa858302fba0efb2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 11:59:46 +0000 Subject: [PATCH 223/242] missing B_cpu --- .../positive_definite/streamlined_sequential_pobtax_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index b5b1833c..6eefb168 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -134,7 +134,7 @@ def sequential_dataset( A_lower_diagonal_blocks_cpu = A_lower_diagonal_blocks_pinned A_arrow_bottom_blocks_cpu = A_lower_arrow_blocks_pinned A_arrow_tip_block_cpu = A_arrow_tip_block_pinned - B = B_pinned + B_cpu = B_pinned t_pobtaf = [] t_pobtas = [] From 1cb143ccd529a6359d2287c08f111e52f075caca Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 12:33:38 +0000 Subject: [PATCH 224/242] changed nvtx --- .../streamlined_sequential_pobtax_gpu.py | 4 ++-- src/serinv/algs/pobtas.py | 22 ++++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 6eefb168..99e325f9 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -180,7 +180,7 @@ def sequential_dataset( #print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) cp.cuda.runtime.deviceSynchronize() - RangePush(f"pobtas: i:{i}") + # RangePush(f"pobtas: i:{i}") tic = time.perf_counter() pobtas( A_diagonal_blocks_cpu, @@ -192,7 +192,7 @@ def sequential_dataset( ) cp.cuda.runtime.deviceSynchronize() toc = time.perf_counter() - 
RangePop() + # RangePop() elapsed = toc - tic print(f"pobtas took: {elapsed:.5f} sec", flush=True) if i >= n_warmups: diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index a6a528f6..07cb064b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -7,6 +7,9 @@ _get_module_from_str, ) +from cupy.cuda.nvtx import RangePush, RangePop + + def pobtas( L_diagonal_blocks: ArrayLike, @@ -239,7 +242,7 @@ def _pobtas_streaming( "Host<->Device streaming only works when host-arrays are given." ) - print("streaming") + cp, cu_la = _get_module_from_str(module_str="cupy") @@ -280,7 +283,7 @@ def _pobtas_streaming( if trans == "N": # ----- Forward substitution ----- - + RangePush(f"pobtas: startup") # Delete helper variable del B_shape @@ -327,27 +330,30 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_lower_diagonal_events[0].record(stream=h2d_stream) + RangePop() # --- Computations --- for i in range(0, n_diag_blocks - 1): # pass next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) + RangePush(f"pobtas: streaming B {i+1}") B_d[(i + 1) % 2].set( arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream = h2d_stream ) + RangePop() h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) if i + 1 < n_diag_blocks - 1: # pass next diagonal block h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - + RangePush(f"pobtas: streaming diag blocks {i+1}") L_diagonal_blocks_d[(i + 1) % 2].set( arr=L_diagonal_blocks[i + 1], stream=h2d_stream ) - + RangePop() h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) @@ -377,12 +383,12 @@ def _pobtas_streaming( if i + 1 < n_diag_blocks - 1: # Pass next lower diagonal block h2d_stream.wait_event(compute_next_B_events[(i + 1) % 2]) - + RangePush(f"pobtas: streaming lower diag blocks {i+1}") L_lower_diagonal_blocks_d[(i + 1) % 2].set( arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream ) - + RangePop() 
h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: @@ -399,12 +405,12 @@ def _pobtas_streaming( if i + 1 < n_diag_blocks - 1: # Pass next lower arrow block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - + RangePush(f"pobtas: streaming lower arrow blocks{i}") L_lower_arrow_blocks_d[(i + 1) % 2].set( arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream ) - + RangePop() h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: From 35e5bf928242c415a35013b6b0f0c098a2feda3a Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 12:40:55 +0000 Subject: [PATCH 225/242] moved pop --- src/serinv/algs/pobtas.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 07cb064b..1446755d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -342,8 +342,8 @@ def _pobtas_streaming( stream = h2d_stream ) - RangePop() h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) + RangePop() if i + 1 < n_diag_blocks - 1: # pass next diagonal block @@ -353,8 +353,9 @@ def _pobtas_streaming( arr=L_diagonal_blocks[i + 1], stream=h2d_stream ) - RangePop() + h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) + RangePop() with compute_stream: @@ -388,8 +389,9 @@ def _pobtas_streaming( arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream ) - RangePop() + h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) + RangePop() with compute_stream: # Update next B block @@ -410,8 +412,9 @@ def _pobtas_streaming( arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream ) - RangePop() + h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) + RangePop() with compute_stream: # Update arrow tip From 01f4b24ff6bb354950cded62efb7e7ccd9285b0f Mon Sep 17 00:00:00 2001 From: 03szust Date: Thu, 15 May 2025 14:57:44 +0000 Subject: [PATCH 226/242] untangled streaming --- .../streamlined_sequential_pobtax_gpu.py | 17 ++++++++------- 
src/serinv/algs/pobtas.py | 21 +++++++++++-------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py index 99e325f9..ddb479a2 100644 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py @@ -136,6 +136,7 @@ def sequential_dataset( A_arrow_tip_block_cpu = A_arrow_tip_block_pinned B_cpu = B_pinned + t_pobtaf = [] t_pobtas = [] t_pobtasi = [] @@ -143,14 +144,14 @@ def sequential_dataset( for i in range(n_warmups + n_iterations): print(f"Iteration: {i+1}/{n_warmups+n_iterations}", flush=True) - tic = time.perf_counter() - A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) - A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) - A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) - A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) - B_gpu.set(arr=B_cpu) - toc = time.perf_counter() - print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) + #tic = time.perf_counter() + #A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) + #A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) + #A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) + #A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) + #B_gpu.set(arr=B_cpu) + #toc = time.perf_counter() + #print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) cp.cuda.runtime.deviceSynchronize() RangePush(f"pobtaf: i:{i}") diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1446755d..d8f20d1b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -371,15 +371,7 @@ def _pobtas_streaming( compute_current_B_events[i % 2].record(stream=compute_stream) # Pass current B block back - d2h_stream.wait_event(compute_current_B_events[i % 2]) - - B_d[i % 2].get( - out=B[i * diag_blocksize : (i + 1) * 
diag_blocksize], - stream=d2h_stream, - blocking=False, - ) - - d2h_B_events[i % 2].record(stream=d2h_stream) + if i + 1 < n_diag_blocks - 1: # Pass next lower diagonal block @@ -392,6 +384,17 @@ def _pobtas_streaming( h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) RangePop() + + d2h_stream.wait_event(compute_current_B_events[i % 2]) + d2h_stream.wait_event(h2d_lower_diagonal_events[(i+1) % 2]) + + B_d[i % 2].get( + out=B[i * diag_blocksize : (i + 1) * diag_blocksize], + stream=d2h_stream, + blocking=False, + ) + + d2h_B_events[i % 2].record(stream=d2h_stream) with compute_stream: # Update next B block From c63cd2c0da0e075203c72e2930d011c6e674e8dd Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:32:40 +0000 Subject: [PATCH 227/242] modified tests --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index e9ce2384..54a0bde5 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -3,15 +3,29 @@ import numpy as np import pytest +from conftest import ARRAY_TYPE as ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bta_dense_to_arrays, dd_bta, symmetrize, rhs from serinv.algs import pobtaf, pobtas +if backend_flags["cupy_avail"]: + ARRAY_TYPE.extend( + [ + + pytest.param("streaming", id="streaming"), + ] + ) + if backend_flags["cupy_avail"]: import cupyx as cpx +@pytest.fixture(params=ARRAY_TYPE, autouse=True) +def array_type(request: pytest.FixtureRequest) -> str: + return request.param + @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobtas( @@ -22,7 +36,6 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - array_type = "streaming" A = dd_bta( diagonal_blocksize, From 
a3905e277b4ee8dc0a12e457116d0bff96f6eb79 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:34:34 +0000 Subject: [PATCH 228/242] pytest array_type override --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 54a0bde5..76e50089 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -3,17 +3,19 @@ import numpy as np import pytest -from conftest import ARRAY_TYPE as ARRAY_TYPE - from serinv import backend_flags, _get_module_from_array from ....testing_utils import bta_dense_to_arrays, dd_bta, symmetrize, rhs from serinv.algs import pobtaf, pobtas + +ARRAY_TYPE = [ + pytest.param("host", id="host"), +] if backend_flags["cupy_avail"]: ARRAY_TYPE.extend( [ - + pytest.param("device", id="device"), pytest.param("streaming", id="streaming"), ] ) From 6913677f503f858247e40f1f90bade9487feb277 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:37:36 +0000 Subject: [PATCH 229/242] changed tests a bit to not override --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 76e50089..3685c174 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -3,19 +3,16 @@ import numpy as np import pytest +from ....conftest import ARRAY_TYPE as ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bta_dense_to_arrays, dd_bta, symmetrize, rhs from serinv.algs import pobtaf, pobtas - -ARRAY_TYPE = [ - pytest.param("host", id="host"), -] if backend_flags["cupy_avail"]: ARRAY_TYPE.extend( [ - pytest.param("device", id="device"), 
pytest.param("streaming", id="streaming"), ] ) From f734dc6df6ba619977e44e3b34ac325985faeec8 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:40:08 +0000 Subject: [PATCH 230/242] activate pobtaf streaming in tests --- tests/tests_algs/regular/tests_bta/test_pobtas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index 3685c174..d58e3a19 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -91,6 +91,7 @@ def test_pobtas( A_lower_diagonal_blocks, A_lower_arrow_blocks, A_arrow_tip_block, + device_streaming=True if array_type == "streaming" else False, ) # Forward solve: Y=L^{-1}B From 5924e1b2e8269526ec1f1b24740353e7fdf8b9f6 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:43:51 +0000 Subject: [PATCH 231/242] removed nvtx and tests the tests --- src/serinv/algs/pobtas.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index d8f20d1b..9225bd7b 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -7,9 +7,6 @@ _get_module_from_str, ) -from cupy.cuda.nvtx import RangePush, RangePop - - def pobtas( L_diagonal_blocks: ArrayLike, @@ -51,6 +48,9 @@ def pobtas( else: # Natural arrowhead if device_streaming: + raise NotImplementedError( + "Test testing." 
+ ) _pobtas_streaming( L_diagonal_blocks, L_lower_diagonal_blocks, @@ -283,7 +283,6 @@ def _pobtas_streaming( if trans == "N": # ----- Forward substitution ----- - RangePush(f"pobtas: startup") # Delete helper variable del B_shape @@ -330,12 +329,12 @@ def _pobtas_streaming( L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) h2d_lower_diagonal_events[0].record(stream=h2d_stream) - RangePop() + # --- Computations --- for i in range(0, n_diag_blocks - 1): # pass next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - RangePush(f"pobtas: streaming B {i+1}") + B_d[(i + 1) % 2].set( arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], @@ -343,19 +342,17 @@ def _pobtas_streaming( ) h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) - RangePop() + if i + 1 < n_diag_blocks - 1: # pass next diagonal block h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) - RangePush(f"pobtas: streaming diag blocks {i+1}") L_diagonal_blocks_d[(i + 1) % 2].set( arr=L_diagonal_blocks[i + 1], stream=h2d_stream ) h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) - RangePop() with compute_stream: @@ -376,14 +373,12 @@ def _pobtas_streaming( if i + 1 < n_diag_blocks - 1: # Pass next lower diagonal block h2d_stream.wait_event(compute_next_B_events[(i + 1) % 2]) - RangePush(f"pobtas: streaming lower diag blocks {i+1}") L_lower_diagonal_blocks_d[(i + 1) % 2].set( arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream ) h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) - RangePop() d2h_stream.wait_event(compute_current_B_events[i % 2]) d2h_stream.wait_event(h2d_lower_diagonal_events[(i+1) % 2]) @@ -410,14 +405,12 @@ def _pobtas_streaming( if i + 1 < n_diag_blocks - 1: # Pass next lower arrow block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - RangePush(f"pobtas: streaming lower arrow blocks{i}") L_lower_arrow_blocks_d[(i + 1) % 2].set( arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream ) 
h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) - RangePop() with compute_stream: # Update arrow tip From c259ccaa895171bd88839d68ba88cd0e183376d4 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:44:28 +0000 Subject: [PATCH 232/242] removed test testing --- src/serinv/algs/pobtas.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 9225bd7b..1575a4ac 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -48,9 +48,6 @@ def pobtas( else: # Natural arrowhead if device_streaming: - raise NotImplementedError( - "Test testing." - ) _pobtas_streaming( L_diagonal_blocks, L_lower_diagonal_blocks, From 4cf986f18000a910a23782f1f8a09f1e71d056b1 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:48:25 +0000 Subject: [PATCH 233/242] expanded tests --- tests/tests_algs/regular/tests_bt/test_pobts.py | 9 +++++++++ tests/tests_algs/regular/tests_bta/test_pobtas.py | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index f5c941dc..0f9835a6 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -3,11 +3,20 @@ import numpy as np import pytest +from ....conftest import ARRAY_TYPE as ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bt_dense_to_arrays, dd_bt, symmetrize, rhs from serinv.algs import pobtf, pobts +if backend_flags["cupy_avail"]: + ARRAY_TYPE.extend( + [ + pytest.param("streaming", id="streaming"), + ] + ) + if backend_flags["cupy_avail"]: import cupyx as cpx diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index d58e3a19..a94040f0 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -21,9 
+21,9 @@ import cupyx as cpx -@pytest.fixture(params=ARRAY_TYPE, autouse=True) -def array_type(request: pytest.FixtureRequest) -> str: - return request.param +#@pytest.fixture(params=ARRAY_TYPE, autouse=True) +#def array_type(request: pytest.FixtureRequest) -> str: +# return request.param @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) From 632b74ccdb3efcef5fee12924179e8f575c2c12f Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:49:51 +0000 Subject: [PATCH 234/242] expanded tests further --- tests/tests_algs/regular/tests_bt/test_pobts.py | 5 +++++ tests/tests_algs/regular/tests_bta/test_pobtas.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index 0f9835a6..f474c6b4 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -20,6 +20,11 @@ if backend_flags["cupy_avail"]: import cupyx as cpx + +@pytest.fixture(params=ARRAY_TYPE, autouse=True) +def array_type(request: pytest.FixtureRequest) -> str: + return request.param + @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobts( diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index a94040f0..d58e3a19 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -21,9 +21,9 @@ import cupyx as cpx -#@pytest.fixture(params=ARRAY_TYPE, autouse=True) -#def array_type(request: pytest.FixtureRequest) -> str: -# return request.param +@pytest.fixture(params=ARRAY_TYPE, autouse=True) +def array_type(request: pytest.FixtureRequest) -> str: + return request.param @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) From 680d8990654eb737956a92b619eb98e6967944b2 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 07:51:20 +0000 
Subject: [PATCH 235/242] activated streaming tests for pobtaf --- tests/tests_algs/regular/tests_bta/test_pobtaf.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/tests_algs/regular/tests_bta/test_pobtaf.py b/tests/tests_algs/regular/tests_bta/test_pobtaf.py index a30b9094..98756357 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtaf.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtaf.py @@ -3,15 +3,28 @@ import numpy as np import pytest +from ....conftest import ARRAY_TYPE as ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bta_dense_to_arrays, dd_bta, symmetrize from serinv.algs import pobtaf +if backend_flags["cupy_avail"]: + ARRAY_TYPE.extend( + [ + pytest.param("streaming", id="streaming"), + ] + ) + if backend_flags["cupy_avail"]: import cupyx as cpx +@pytest.fixture(params=ARRAY_TYPE, autouse=True) +def array_type(request: pytest.FixtureRequest) -> str: + return request.param + @pytest.mark.mpi_skip() def test_pobtaf( diagonal_blocksize: int, From 10de2c5f37536c12cd5f96956b0af0ae061eef4a Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 08:04:03 +0000 Subject: [PATCH 236/242] removed leftover cscs scripts --- run_streamlined_sequential_pobtax_gpu.sh | 51 ---- ...d_sequential_pobtax_gpu.sh:Zone.Identifier | 3 - .../streamlined_sequential_pobtax_gpu.py | 234 ------------------ ...d_sequential_pobtax_gpu.py:Zone.Identifier | 3 - 4 files changed, 291 deletions(-) delete mode 100644 run_streamlined_sequential_pobtax_gpu.sh delete mode 100644 sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier delete mode 100644 sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py delete mode 100644 sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier diff --git a/run_streamlined_sequential_pobtax_gpu.sh b/run_streamlined_sequential_pobtax_gpu.sh deleted file mode 100644 index 74b5d9db..00000000 --- 
a/run_streamlined_sequential_pobtax_gpu.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name="serinv_pobtx_benchmark" -#SBATCH --output=%x.%j.out -#SBATCH --error=%x.%j.err -#SBATCH --account=lp16 -#SBATCH --time=00:10:00 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=64 -#SBATCH --gpus-per-task=1 -#SBATCH --partition=debug -#SBATCH --constraint=gpu -#SBATCH --hint=nomultithread -#SBATCH --uenv=prgenv-gnu/24.11:v1 -#SBATCH --view=modules - -set -e -u - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export MPICH_GPU_SUPPORT_ENABLED=1 -export OMP_PLACES=cores -export OMP_PROC_BIND=close - -export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID - -# source ~/load_modules.sh -conda activate serinv_env - -# Dataset 1: b = 1675, a = 6, n = 128 -# Reference timings (to beat!): -# - pobtaf: 0.38959 -# - pobtas: 0.02415 -# - pobtasi: 0.29593 -# export b=1675 -# export a=6 -# export n=128 - -# Dataset 2: b = 4002, a = 6, n = 250 -# Reference timings (to beat!): -# - pobtaf: 3.2716 (INLA_BTA CUDA code: 2.713) -# - pobtas: 0.15397 -# - pobtasi: 5.15729 -export b=4002 -export a=6 -export n=250 - -# Benchmark the code -srun python ~/serinv/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n - -# Profile the code -# srun nsys profile --force-overwrite=true -o profile_serinv_pobtax_b${b}_a${a}_n${n} python ~/repositories/serinv/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py --b $b --a $a --n $n --b $b --a $a --n $n \ No newline at end of file diff --git a/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier b/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier deleted file mode 100644 index 33e02d64..00000000 --- a/sc25_runs/positive_definite/run_streamlined_sequential_pobtax_gpu.sh:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 
-HostUrl=https://iis-mattermost.ee.ethz.ch/api/v4/files/waiggpk1miyeb84dcahdh53b1e?download=1 diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py deleted file mode 100644 index ddb479a2..00000000 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py +++ /dev/null @@ -1,234 +0,0 @@ -import time - -tic = time.perf_counter() -import argparse - -import numpy as np -import cupy as cp -from cupy.cuda.nvtx import RangePush, RangePop -import cupyx as cpx - -from serinv.algs import pobtaf, pobtas, pobtasi - - -def sequential_dataset( - n_blocks: int, - diagonal_blocksize: int, - arrowhead_blocksize: int, -): - A_diagonal_blocks = np.random.rand(n_blocks, diagonal_blocksize, diagonal_blocksize) - A_lower_diagonal_blocks = np.random.rand( - n_blocks - 1, diagonal_blocksize, diagonal_blocksize - ) - A_arrow_bottom_blocks = np.random.rand( - n_blocks, arrowhead_blocksize, diagonal_blocksize - ) - A_arrow_tip_block = np.random.rand(arrowhead_blocksize, arrowhead_blocksize) - - # CODE TO MODIFY - arrow_colsum = np.zeros((arrowhead_blocksize), dtype=A_diagonal_blocks.dtype) - for i in range(A_diagonal_blocks.shape[0]): - colsum = np.sum(A_diagonal_blocks[i, :, :], axis=1) - np.diag( - A_diagonal_blocks[i, :, :] - ) - if i > 0: - colsum += np.sum(A_lower_diagonal_blocks[i - 1, :, :], axis=1) - - A_diagonal_blocks[i, :, :] += np.diag(colsum) - - arrow_colsum[:] += np.sum(A_arrow_bottom_blocks[i, :, :], axis=1) - - A_arrow_tip_block[:, :] += np.diag( - arrow_colsum + np.sum(A_arrow_tip_block[:, :], axis=1) - ) - - return ( - A_diagonal_blocks, - A_lower_diagonal_blocks, - A_arrow_bottom_blocks, - A_arrow_tip_block, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process some integers.") - parser.add_argument( - "--b", - type=int, - default=128, - help="an integer for the diagonal block size", - ) - parser.add_argument( - "--a", - type=int, 
- default=0, - help="an integer for the diagonal block size", - ) - parser.add_argument( - "--n", - type=int, - default=8, - help="an integer for the number of diagonal blocks", - ) - args = parser.parse_args() - toc = time.perf_counter() - print(f"Import and parsing took: {toc - tic:.5f} sec", flush=True) - - diagonal_blocksize = args.b - arrowhead_blocksize = args.a - n_blocks = args.n - n_iterations = 10 - n_warmups = 2 - - tic = time.perf_counter() - ( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, - ) = sequential_dataset( - n_blocks, - diagonal_blocksize, - arrowhead_blocksize, - ) - B_cpu = np.random.rand(diagonal_blocksize * n_blocks + arrowhead_blocksize, 1) - toc = time.perf_counter() - print(f"Generate dataset took: {toc - tic:.5f} sec", flush=True) - print(f" b = {diagonal_blocksize}", flush=True) - print(f" a = {arrowhead_blocksize}", flush=True) - print(f" n = {n_blocks}", flush=True) - print(f" n_iterations = {n_iterations}", flush=True) - print(f" n_warmups = {n_warmups}", flush=True) - - total_memory = ( - A_diagonal_blocks_cpu.nbytes - + A_lower_diagonal_blocks_cpu.nbytes - + A_arrow_bottom_blocks_cpu.nbytes - + A_arrow_tip_block_cpu.nbytes - + B_cpu.nbytes - ) - print(f" Total memory: {total_memory / 1e9:.5f} GB", flush=True) - - tic = time.perf_counter() - # Init device arrays - A_diagonal_blocks_gpu = cp.empty_like(A_diagonal_blocks_cpu) - A_lower_diagonal_blocks_gpu = cp.empty_like(A_lower_diagonal_blocks_cpu) - A_arrow_bottom_blocks_gpu = cp.empty_like(A_arrow_bottom_blocks_cpu) - A_arrow_tip_block_gpu = cp.empty_like(A_arrow_tip_block_cpu) - B_gpu = cp.empty_like(B_cpu) - toc = time.perf_counter() - print(f"Init device arrays took: {toc - tic:.5f} sec", flush=True) - - A_diagonal_blocks_pinned = cpx.zeros_like_pinned(A_diagonal_blocks_cpu) - A_diagonal_blocks_pinned[:, :, :] = A_diagonal_blocks_cpu[:, :, :] - A_lower_diagonal_blocks_pinned = 
cpx.zeros_like_pinned(A_lower_diagonal_blocks_cpu) - A_lower_diagonal_blocks_pinned[:, :, :] = A_lower_diagonal_blocks_cpu[:, :, :] - A_lower_arrow_blocks_pinned = cpx.zeros_like_pinned(A_arrow_bottom_blocks_cpu) - A_lower_arrow_blocks_pinned[:, :, :] = A_arrow_bottom_blocks_cpu[:, :, :] - A_arrow_tip_block_pinned = cpx.zeros_like_pinned(A_arrow_tip_block_cpu) - A_arrow_tip_block_pinned[:, :] = A_arrow_tip_block_cpu[:, :] - B_pinned = cpx.zeros_like_pinned(B_cpu) - B_pinned[:, :] = B_cpu[:, :] - - A_diagonal_blocks_cpu = A_diagonal_blocks_pinned - A_lower_diagonal_blocks_cpu = A_lower_diagonal_blocks_pinned - A_arrow_bottom_blocks_cpu = A_lower_arrow_blocks_pinned - A_arrow_tip_block_cpu = A_arrow_tip_block_pinned - B_cpu = B_pinned - - - t_pobtaf = [] - t_pobtas = [] - t_pobtasi = [] - - for i in range(n_warmups + n_iterations): - print(f"Iteration: {i+1}/{n_warmups+n_iterations}", flush=True) - - #tic = time.perf_counter() - #A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) - #A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) - #A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) - #A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) - #B_gpu.set(arr=B_cpu) - #toc = time.perf_counter() - #print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) - - cp.cuda.runtime.deviceSynchronize() - RangePush(f"pobtaf: i:{i}") - tic = time.perf_counter() - pobtaf( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, - device_streaming=True - ) - cp.cuda.runtime.deviceSynchronize() - toc = time.perf_counter() - RangePop() - elapsed = toc - tic - print(f"pobtaf took: {elapsed:.5f} sec", flush=True) - if i >= n_warmups: - t_pobtaf.append(elapsed) - - #tic = time.perf_counter() - #A_diagonal_blocks_gpu.get(out=A_diagonal_blocks_cpu) - #A_lower_diagonal_blocks_gpu.get(out=A_lower_diagonal_blocks_cpu) - #A_arrow_bottom_blocks_gpu.get(out=A_arrow_bottom_blocks_cpu) - 
#A_arrow_tip_block_gpu.get(out=A_arrow_tip_block_cpu) - #B_gpu.get(out=B_cpu) - #toc = time.perf_counter() - #print(f"Copying data from GPU took: {toc - tic:.5f} sec", flush=True) - - cp.cuda.runtime.deviceSynchronize() - # RangePush(f"pobtas: i:{i}") - tic = time.perf_counter() - pobtas( - A_diagonal_blocks_cpu, - A_lower_diagonal_blocks_cpu, - A_arrow_bottom_blocks_cpu, - A_arrow_tip_block_cpu, - B_cpu, - device_streaming=True - ) - cp.cuda.runtime.deviceSynchronize() - toc = time.perf_counter() - # RangePop() - elapsed = toc - tic - print(f"pobtas took: {elapsed:.5f} sec", flush=True) - if i >= n_warmups: - t_pobtas.append(elapsed) - - tic = time.perf_counter() - A_diagonal_blocks_gpu.set(arr=A_diagonal_blocks_cpu) - A_lower_diagonal_blocks_gpu.set(arr=A_lower_diagonal_blocks_cpu) - A_arrow_bottom_blocks_gpu.set(arr=A_arrow_bottom_blocks_cpu) - A_arrow_tip_block_gpu.set(arr=A_arrow_tip_block_cpu) - B_gpu.set(arr=B_cpu) - toc = time.perf_counter() - print(f"Copying data to GPU took: {toc - tic:.5f} sec", flush=True) - - cp.cuda.runtime.deviceSynchronize() - RangePush(f"pobtasi: i:{i}") - tic = time.perf_counter() - pobtasi( - A_diagonal_blocks_gpu, - A_lower_diagonal_blocks_gpu, - A_arrow_bottom_blocks_gpu, - A_arrow_tip_block_gpu, - ) - cp.cuda.runtime.deviceSynchronize() - toc = time.perf_counter() - RangePop() - elapsed = toc - tic - print(f"pobtasi took: {elapsed:.5f} sec", flush=True) - if i >= n_warmups: - t_pobtasi.append(elapsed) - - print(f"t_pobtaf: {t_pobtaf}", flush=True) - print(f"t_pobtas: {t_pobtas}", flush=True) - print(f"t_pobtasi: {t_pobtasi}", flush=True) - - print(f"avg t_pobtaf: {np.mean(np.array(t_pobtaf)):.5f} sec", flush=True) - print(f"avg t_pobtas: {np.mean(np.array(t_pobtas)):.5f} sec", flush=True) - print(f"avg t_pobtasi: {np.mean(np.array(t_pobtasi)):.5f} sec", flush=True) \ No newline at end of file diff --git a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier 
b/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier deleted file mode 100644 index ce8dec59..00000000 --- a/sc25_runs/positive_definite/streamlined_sequential_pobtax_gpu.py:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 -HostUrl=https://iis-mattermost.ee.ethz.ch/api/v4/files/fw5m5tapefbi8deseto5qqro9w?download=1 From 7390a6b56c8f59efab0850366f331ad239501350 Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 16 May 2025 08:22:14 +0000 Subject: [PATCH 237/242] removed line that forced streaming --- tests/tests_algs/regular/tests_bt/test_pobts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py b/tests/tests_algs/regular/tests_bt/test_pobts.py index f474c6b4..d137c796 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -34,7 +34,6 @@ def test_pobts( array_type: str, dtype: np.dtype, ): - array_type = "streaming" A = dd_bt( diagonal_blocksize, From c8f89e2e351fc7519e284051bd41b590eba4f6d1 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 27 May 2025 14:46:12 +0000 Subject: [PATCH 238/242] first modification to get cupy and scipy implementations for trsm right and left hand side --- src/serinv/utils/trsm_solve_device.py | 103 ++++++++++++++++++++++ src/serinv/utils/trsm_solve_host.py | 122 ++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 src/serinv/utils/trsm_solve_device.py create mode 100644 src/serinv/utils/trsm_solve_host.py diff --git a/src/serinv/utils/trsm_solve_device.py b/src/serinv/utils/trsm_solve_device.py new file mode 100644 index 00000000..9d358a1c --- /dev/null +++ b/src/serinv/utils/trsm_solve_device.py @@ -0,0 +1,103 @@ +import numpy + +from cupy.cuda import cublas +from cupy.cuda import device +from cupy.linalg import _util + + +def solve_triangular_device(a, b, trans=0, lower=False, unit_diagonal=False, + overwrite_b=False, check_finite=False, aplha = 
1., side=0): + """Solve the equation a x = b for x, assuming a is a triangular matrix. + + Args: + a (cupy.ndarray): The matrix with dimension ``(M, M)``. + b (cupy.ndarray): The matrix with dimension ``(M,)`` or + ``(M, N)``. + lower (bool): Use only data contained in the lower triangle of ``a``. + Default is to use upper triangle. + trans (0, 1, 2, 'N', 'T' or 'C'): Type of system to solve: + + - *'0'* or *'N'* -- :math:`a x = b` + - *'1'* or *'T'* -- :math:`a^T x = b` + - *'2'* or *'C'* -- :math:`a^H x = b` + + unit_diagonal (bool): If ``True``, diagonal elements of ``a`` are + assumed to be 1 and will not be referenced. + overwrite_b (bool): Allow overwriting data in b (may enhance + performance) + check_finite (bool): Whether to check that the input matrices contain + only finite numbers. Disabling may give a performance gain, but may + result in problems (crashes, non-termination) if the inputs do + contain infinities or NaNs. + + Returns: + cupy.ndarray: + The matrix with dimension ``(M,)`` or ``(M, N)``. + + .. 
seealso:: :func:`scipy.linalg.solve_triangular` + """ + + _util._assert_cupy_array(a, b) + + if len(a.shape) != 2 or a.shape[0] != a.shape[1]: + raise ValueError('expected square matrix') + if len(a) != len(b): + raise ValueError('incompatible dimensions') + + # Cast to float32 or float64 + if a.dtype.char in 'fd': + dtype = a.dtype + else: + dtype = numpy.promote_types(a.dtype.char, 'f') + + a = cupy.array(a, dtype=dtype, order='F', copy=False) + b = cupy.array(b, dtype=dtype, order='F', copy=(not overwrite_b)) + + if check_finite: + if a.dtype.kind == 'f' and not cupy.isfinite(a).all(): + raise ValueError( + 'array must not contain infs or NaNs') + if b.dtype.kind == 'f' and not cupy.isfinite(b).all(): + raise ValueError( + 'array must not contain infs or NaNs') + + m, n = (b.size, 1) if b.ndim == 1 else b.shape + cublas_handle = device.get_cublas_handle() + + if dtype == 'f': + trsm = cublas.strsm + elif dtype == 'd': + trsm = cublas.dtrsm + elif dtype == 'F': + trsm = cublas.ctrsm + else: # dtype == 'D' + trsm = cublas.ztrsm + one = numpy.array(1, dtype=dtype) + + if lower: + uplo = cublas.CUBLAS_FILL_MODE_LOWER + else: + uplo = cublas.CUBLAS_FILL_MODE_UPPER + + if trans == 'N': + trans = cublas.CUBLAS_OP_N + elif trans == 'T': + trans = cublas.CUBLAS_OP_T + elif trans == 'C': + trans = cublas.CUBLAS_OP_C + + if unit_diagonal: + diag = cublas.CUBLAS_DIAG_UNIT + else: + diag = cublas.CUBLAS_DIAG_NON_UNIT + + if side: + blas_side = cublas.CUBLAS_SIDE_RIGHT + else: + blas_side = cublas.CUBLAS_SIDE_LEFT + + trsm( + cublas_handle, blas_side, uplo, + trans, diag, + m, n, one.ctypes.data, a.data.ptr, m, b.data.ptr, m) + return b \ No newline at end of file diff --git a/src/serinv/utils/trsm_solve_host.py b/src/serinv/utils/trsm_solve_host.py new file mode 100644 index 00000000..820770e4 --- /dev/null +++ b/src/serinv/utils/trsm_solve_host.py @@ -0,0 +1,122 @@ +import numpy as np + + +from scipy.linalg.blas import get_blas_funcs +from scipy.linalg._misc import 
_datacopied +from scipy.linalg._decomp import _asarray_validated + +def solve_triangular_host(a, b, trans=0, lower=False, unit_diagonal=False, + overwrite_b=False, check_finite=True, side=0): + """ + Solve the equation ``a x = b`` for `x`, assuming a is a triangular matrix. + + Parameters + ---------- + a : (M, M) array_like + A triangular matrix + b : (M,) or (M, N) array_like + Right-hand side matrix in ``a x = b`` + lower : bool, optional + Use only data contained in the lower triangle of `a`. + Default is to use upper triangle. + trans : {0, 1, 2, 'N', 'T', 'C'}, optional + Type of system to solve: + + ======== ========= + trans system + ======== ========= + 0 or 'N' a x = b + 1 or 'T' a^T x = b + 2 or 'C' a^H x = b + ======== ========= + unit_diagonal : bool, optional + If True, diagonal elements of `a` are assumed to be 1 and + will not be referenced. + overwrite_b : bool, optional + Allow overwriting data in `b` (may enhance performance) + check_finite : bool, optional + Whether to check that the input matrices contain only finite numbers. + Disabling may give a performance gain, but may result in problems + (crashes, non-termination) if the inputs do contain infinities or NaNs. + + Returns + ------- + x : (M,) or (M, N) ndarray + Solution to the system ``a x = b``. Shape of return matches `b`. + + Raises + ------ + LinAlgError + If `a` is singular + + Notes + ----- + .. 
versionadded:: 0.9.0 + + Examples + -------- + Solve the lower triangular system a x = b, where:: + + [3 0 0 0] [4] + a = [2 1 0 0] b = [2] + [1 0 1 0] [4] + [1 1 1 1] [2] + + >>> import numpy as np + >>> from scipy.linalg import solve_triangular + >>> a = np.array([[3, 0, 0, 0], [2, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) + >>> b = np.array([4, 2, 4, 2]) + >>> x = solve_triangular(a, b, lower=True) + >>> x + array([ 1.33333333, -0.66666667, 2.66666667, -1.33333333]) + >>> a.dot(x) # Check the result + array([ 4., 2., 4., 2.]) + + """ + + a1 = _asarray_validated(a, check_finite=check_finite) + b1 = _asarray_validated(b, check_finite=check_finite) + + if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]: + raise ValueError('expected square matrix') + + if a1.shape[0] != b1.shape[0]: + raise ValueError(f'shapes of a {a1.shape} and b {b1.shape} are incompatible') + + # accommodate empty arrays + if b1.size == 0: + dt_nonempty = solve_triangular_host( + np.eye(2, dtype=a1.dtype), np.ones(2, dtype=b1.dtype) + ).dtype + return np.empty_like(b1, dtype=dt_nonempty) + + overwrite_b = overwrite_b or _datacopied(b1, b) + + x = _solve_triangular(a1, b1, trans, lower, unit_diagonal, overwrite_b, side) + return x + + +# solve_triangular without the input validation +def _solve_triangular(a1, b1, trans=0, lower=False, unit_diagonal=False, + overwrite_b=False, side=0): + + trans = {'N': 0, 'T': 1, 'C': 2}.get(trans, trans) + trsm, = get_blas_funcs(('trsm',), (a1, b1)) + + if a1.dtype.char in 'fd': + dtype = a1.dtype + else: + dtype = np.promote_types(a1.dtype.char, 'f') + + one = np.array(1, dtype=dtype) + alpha = one.ctypes.data + + if a1.flags.f_contiguous or trans == 2: + x = trsm(alpha, a1, b1, overwrite_b=overwrite_b, lower=lower, + trans_a=trans, diag=unit_diagonal, side=side) + else: + # transposed system is solved since trtrs expects Fortran ordering + x = trsm(alpha, a1.T, b1, overwrite_b=overwrite_b, lower=not lower, + trans_a=not trans, diag=unit_diagonal, side=side) + + 
return x From cbd07cdb59657d040939cbf6ff6d908bdab2ca09 Mon Sep 17 00:00:00 2001 From: 03szust Date: Tue, 27 May 2025 14:51:09 +0000 Subject: [PATCH 239/242] moved improvement files to new branch --- src/serinv/utils/trsm_solve_device.py | 103 ---------------------- src/serinv/utils/trsm_solve_host.py | 122 -------------------------- 2 files changed, 225 deletions(-) delete mode 100644 src/serinv/utils/trsm_solve_device.py delete mode 100644 src/serinv/utils/trsm_solve_host.py diff --git a/src/serinv/utils/trsm_solve_device.py b/src/serinv/utils/trsm_solve_device.py deleted file mode 100644 index 9d358a1c..00000000 --- a/src/serinv/utils/trsm_solve_device.py +++ /dev/null @@ -1,103 +0,0 @@ -import numpy - -from cupy.cuda import cublas -from cupy.cuda import device -from cupy.linalg import _util - - -def solve_triangular_device(a, b, trans=0, lower=False, unit_diagonal=False, - overwrite_b=False, check_finite=False, aplha = 1., side=0): - """Solve the equation a x = b for x, assuming a is a triangular matrix. - - Args: - a (cupy.ndarray): The matrix with dimension ``(M, M)``. - b (cupy.ndarray): The matrix with dimension ``(M,)`` or - ``(M, N)``. - lower (bool): Use only data contained in the lower triangle of ``a``. - Default is to use upper triangle. - trans (0, 1, 2, 'N', 'T' or 'C'): Type of system to solve: - - - *'0'* or *'N'* -- :math:`a x = b` - - *'1'* or *'T'* -- :math:`a^T x = b` - - *'2'* or *'C'* -- :math:`a^H x = b` - - unit_diagonal (bool): If ``True``, diagonal elements of ``a`` are - assumed to be 1 and will not be referenced. - overwrite_b (bool): Allow overwriting data in b (may enhance - performance) - check_finite (bool): Whether to check that the input matrices contain - only finite numbers. Disabling may give a performance gain, but may - result in problems (crashes, non-termination) if the inputs do - contain infinities or NaNs. - - Returns: - cupy.ndarray: - The matrix with dimension ``(M,)`` or ``(M, N)``. - - .. 
seealso:: :func:`scipy.linalg.solve_triangular` - """ - - _util._assert_cupy_array(a, b) - - if len(a.shape) != 2 or a.shape[0] != a.shape[1]: - raise ValueError('expected square matrix') - if len(a) != len(b): - raise ValueError('incompatible dimensions') - - # Cast to float32 or float64 - if a.dtype.char in 'fd': - dtype = a.dtype - else: - dtype = numpy.promote_types(a.dtype.char, 'f') - - a = cupy.array(a, dtype=dtype, order='F', copy=False) - b = cupy.array(b, dtype=dtype, order='F', copy=(not overwrite_b)) - - if check_finite: - if a.dtype.kind == 'f' and not cupy.isfinite(a).all(): - raise ValueError( - 'array must not contain infs or NaNs') - if b.dtype.kind == 'f' and not cupy.isfinite(b).all(): - raise ValueError( - 'array must not contain infs or NaNs') - - m, n = (b.size, 1) if b.ndim == 1 else b.shape - cublas_handle = device.get_cublas_handle() - - if dtype == 'f': - trsm = cublas.strsm - elif dtype == 'd': - trsm = cublas.dtrsm - elif dtype == 'F': - trsm = cublas.ctrsm - else: # dtype == 'D' - trsm = cublas.ztrsm - one = numpy.array(1, dtype=dtype) - - if lower: - uplo = cublas.CUBLAS_FILL_MODE_LOWER - else: - uplo = cublas.CUBLAS_FILL_MODE_UPPER - - if trans == 'N': - trans = cublas.CUBLAS_OP_N - elif trans == 'T': - trans = cublas.CUBLAS_OP_T - elif trans == 'C': - trans = cublas.CUBLAS_OP_C - - if unit_diagonal: - diag = cublas.CUBLAS_DIAG_UNIT - else: - diag = cublas.CUBLAS_DIAG_NON_UNIT - - if side: - blas_side = cublas.CUBLAS_SIDE_RIGHT - else: - blas_side = cublas.CUBLAS_SIDE_LEFT - - trsm( - cublas_handle, blas_side, uplo, - trans, diag, - m, n, one.ctypes.data, a.data.ptr, m, b.data.ptr, m) - return b \ No newline at end of file diff --git a/src/serinv/utils/trsm_solve_host.py b/src/serinv/utils/trsm_solve_host.py deleted file mode 100644 index 820770e4..00000000 --- a/src/serinv/utils/trsm_solve_host.py +++ /dev/null @@ -1,122 +0,0 @@ -import numpy as np - - -from scipy.linalg.blas import get_blas_funcs -from scipy.linalg._misc import 
_datacopied -from scipy.linalg._decomp import _asarray_validated - -def solve_triangular_host(a, b, trans=0, lower=False, unit_diagonal=False, - overwrite_b=False, check_finite=True, side=0): - """ - Solve the equation ``a x = b`` for `x`, assuming a is a triangular matrix. - - Parameters - ---------- - a : (M, M) array_like - A triangular matrix - b : (M,) or (M, N) array_like - Right-hand side matrix in ``a x = b`` - lower : bool, optional - Use only data contained in the lower triangle of `a`. - Default is to use upper triangle. - trans : {0, 1, 2, 'N', 'T', 'C'}, optional - Type of system to solve: - - ======== ========= - trans system - ======== ========= - 0 or 'N' a x = b - 1 or 'T' a^T x = b - 2 or 'C' a^H x = b - ======== ========= - unit_diagonal : bool, optional - If True, diagonal elements of `a` are assumed to be 1 and - will not be referenced. - overwrite_b : bool, optional - Allow overwriting data in `b` (may enhance performance) - check_finite : bool, optional - Whether to check that the input matrices contain only finite numbers. - Disabling may give a performance gain, but may result in problems - (crashes, non-termination) if the inputs do contain infinities or NaNs. - - Returns - ------- - x : (M,) or (M, N) ndarray - Solution to the system ``a x = b``. Shape of return matches `b`. - - Raises - ------ - LinAlgError - If `a` is singular - - Notes - ----- - .. 
versionadded:: 0.9.0 - - Examples - -------- - Solve the lower triangular system a x = b, where:: - - [3 0 0 0] [4] - a = [2 1 0 0] b = [2] - [1 0 1 0] [4] - [1 1 1 1] [2] - - >>> import numpy as np - >>> from scipy.linalg import solve_triangular - >>> a = np.array([[3, 0, 0, 0], [2, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) - >>> b = np.array([4, 2, 4, 2]) - >>> x = solve_triangular(a, b, lower=True) - >>> x - array([ 1.33333333, -0.66666667, 2.66666667, -1.33333333]) - >>> a.dot(x) # Check the result - array([ 4., 2., 4., 2.]) - - """ - - a1 = _asarray_validated(a, check_finite=check_finite) - b1 = _asarray_validated(b, check_finite=check_finite) - - if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]: - raise ValueError('expected square matrix') - - if a1.shape[0] != b1.shape[0]: - raise ValueError(f'shapes of a {a1.shape} and b {b1.shape} are incompatible') - - # accommodate empty arrays - if b1.size == 0: - dt_nonempty = solve_triangular_host( - np.eye(2, dtype=a1.dtype), np.ones(2, dtype=b1.dtype) - ).dtype - return np.empty_like(b1, dtype=dt_nonempty) - - overwrite_b = overwrite_b or _datacopied(b1, b) - - x = _solve_triangular(a1, b1, trans, lower, unit_diagonal, overwrite_b, side) - return x - - -# solve_triangular without the input validation -def _solve_triangular(a1, b1, trans=0, lower=False, unit_diagonal=False, - overwrite_b=False, side=0): - - trans = {'N': 0, 'T': 1, 'C': 2}.get(trans, trans) - trsm, = get_blas_funcs(('trsm',), (a1, b1)) - - if a1.dtype.char in 'fd': - dtype = a1.dtype - else: - dtype = np.promote_types(a1.dtype.char, 'f') - - one = np.array(1, dtype=dtype) - alpha = one.ctypes.data - - if a1.flags.f_contiguous or trans == 2: - x = trsm(alpha, a1, b1, overwrite_b=overwrite_b, lower=lower, - trans_a=trans, diag=unit_diagonal, side=side) - else: - # transposed system is solved since trtrs expects Fortran ordering - x = trsm(alpha, a1.T, b1, overwrite_b=overwrite_b, lower=not lower, - trans_a=not trans, diag=unit_diagonal, side=side) - - 
return x From 96fb56beb604c77a209c79095baaebe439b5fa7f Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 5 Jun 2025 15:18:36 +0200 Subject: [PATCH 240/242] unified (and added) test streaming for pobtaf/si --- tests/conftest.py | 3 --- tests/tests_algs/regular/tests_bt/test_pobtf.py | 12 ++++++++++++ tests/tests_algs/regular/tests_bt/test_pobts.py | 16 ++++++++-------- tests/tests_algs/regular/tests_bt/test_pobtsi.py | 12 ++++++++++++ 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3e624933..4d15c7db 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ # Copyright 2023-2025 ETH Zurich. All rights reserved. # Global pytest fixtures for the Serinv tests. - import pytest from serinv import backend_flags @@ -15,7 +14,6 @@ ] ) - DTYPE = [ pytest.param("float64", id="float64"), pytest.param("complex128", id="complex128"), @@ -26,7 +24,6 @@ pytest.param(3, id="diagonal_blocksize=3"), ] - @pytest.fixture(params=ARRAY_TYPE, autouse=True) def array_type(request: pytest.FixtureRequest) -> str: return request.param diff --git a/tests/tests_algs/regular/tests_bt/test_pobtf.py b/tests/tests_algs/regular/tests_bt/test_pobtf.py index d1969b05..0ac3ae89 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobtf.py +++ b/tests/tests_algs/regular/tests_bt/test_pobtf.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from ....conftest import ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bt_dense_to_arrays, dd_bt, symmetrize @@ -11,6 +13,16 @@ if backend_flags["cupy_avail"]: import cupyx as cpx + ARRAY_TYPE.extend( + [ + pytest.param("streaming", id="streaming"), + ] + ) + + @pytest.fixture(params=ARRAY_TYPE, autouse=True) + def array_type(request: pytest.FixtureRequest) -> str: + return request.param + @pytest.mark.mpi_skip() def test_pobtf( diff --git a/tests/tests_algs/regular/tests_bt/test_pobts.py 
b/tests/tests_algs/regular/tests_bt/test_pobts.py index d137c796..9011caa1 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobts.py +++ b/tests/tests_algs/regular/tests_bt/test_pobts.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from ....conftest import ARRAY_TYPE as ARRAY_TYPE +from ....conftest import ARRAY_TYPE from serinv import backend_flags, _get_module_from_array from ....testing_utils import bt_dense_to_arrays, dd_bt, symmetrize, rhs @@ -11,20 +11,19 @@ from serinv.algs import pobtf, pobts if backend_flags["cupy_avail"]: + import cupyx as cpx + ARRAY_TYPE.extend( [ pytest.param("streaming", id="streaming"), ] ) -if backend_flags["cupy_avail"]: - import cupyx as cpx + @pytest.fixture(params=ARRAY_TYPE, autouse=True) + def array_type(request: pytest.FixtureRequest) -> str: + return request.param -@pytest.fixture(params=ARRAY_TYPE, autouse=True) -def array_type(request: pytest.FixtureRequest) -> str: - return request.param - @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobts( @@ -34,7 +33,7 @@ def test_pobts( array_type: str, dtype: np.dtype, ): - + A = dd_bt( diagonal_blocksize, n_diag_blocks, @@ -79,6 +78,7 @@ def test_pobts( pobtf( A_diagonal_blocks, A_lower_diagonal_blocks, + device_streaming=True if array_type == "streaming" else False, ) # Forward solve: Y=L^{-1}B diff --git a/tests/tests_algs/regular/tests_bt/test_pobtsi.py b/tests/tests_algs/regular/tests_bt/test_pobtsi.py index 22ec1d3a..5463a7b9 100644 --- a/tests/tests_algs/regular/tests_bt/test_pobtsi.py +++ b/tests/tests_algs/regular/tests_bt/test_pobtsi.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from ....conftest import ARRAY_TYPE + from serinv import backend_flags, _get_module_from_array from ....testing_utils import bt_dense_to_arrays, dd_bt, symmetrize @@ -11,6 +13,16 @@ if backend_flags["cupy_avail"]: import cupyx as cpx + ARRAY_TYPE.extend( + [ + pytest.param("streaming", id="streaming"), + ] + ) + + @pytest.fixture(params=ARRAY_TYPE, 
autouse=True) + def array_type(request: pytest.FixtureRequest) -> str: + return request.param + @pytest.mark.mpi_skip() def test_pobtsi( From 33ba4677c15ad94b6759b47ea56917a6275216c2 Mon Sep 17 00:00:00 2001 From: vincent-maillou Date: Thu, 5 Jun 2025 15:24:18 +0200 Subject: [PATCH 241/242] just ran `black .` --- src/serinv/algs/pobtas.py | 230 ++++++++++-------- src/serinv/algs/pobts.py | 130 +++++----- src/serinv/wrappers/ddbtars.py | 1 - src/serinv/wrappers/pddbtasc.py | 2 +- src/serinv/wrappers/pddbtasci.py | 2 +- src/serinv/wrappers/pddbtsc.py | 4 +- src/serinv/wrappers/pddbtsci.py | 2 +- tests/conftest.py | 1 + .../permuted/test_bt/test_pobts_permuted.py | 4 +- .../regular/tests_bta/test_pobtaf.py | 1 + .../regular/tests_bta/test_pobtas.py | 3 +- 11 files changed, 207 insertions(+), 173 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1575a4ac..cc51e3bb 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -223,7 +223,8 @@ def _pobtas_permuted( ) else: raise ValueError(f"Invalid transpose argument: {trans}.") - + + def _pobtas_streaming( L_diagonal_blocks: ArrayLike, L_lower_diagonal_blocks: ArrayLike, @@ -238,8 +239,6 @@ def _pobtas_streaming( raise NotImplementedError( "Host<->Device streaming only works when host-arrays are given." 
) - - cp, cu_la = _get_module_from_str(module_str="cupy") @@ -253,18 +252,13 @@ def _pobtas_streaming( h2d_stream = cp.cuda.Stream(non_blocking=True) d2h_stream = cp.cuda.Stream(non_blocking=True) - - # Device Buffers # B Buffers - B_shape = B[-arrow_blocksize:] # block template + B_shape = B[-arrow_blocksize:] # block template B_arrow_tip_d = cp.empty_like(B_shape) - B_shape = B[0 : diag_blocksize] - B_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) - + B_shape = B[0:diag_blocksize] + B_d = cp.empty((2, *B_shape.shape), dtype=B_shape.dtype) # L Buffers L_diagonal_blocks_d = cp.empty( @@ -307,9 +301,9 @@ def _pobtas_streaming( L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) # --- H2D: transfers --- - B_d[0].set(arr=B[0 : diag_blocksize], stream = h2d_stream) + B_d[0].set(arr=B[0:diag_blocksize], stream=h2d_stream) h2d_B_events[0].record(stream=h2d_stream) - + L_diagonal_blocks_d[0].set(arr=L_diagonal_blocks[0], stream=h2d_stream) h2d_diagonal_events[0].record(stream=h2d_stream) @@ -318,39 +312,36 @@ def _pobtas_streaming( # --- D2H: event --- d2h_B_events[1].record(stream=d2h_stream) - + n_diag_blocks: int = L_diagonal_blocks.shape[0] if n_diag_blocks > 1: - L_lower_diagonal_blocks_d[0].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + L_lower_diagonal_blocks_d[0].set( + arr=L_lower_diagonal_blocks[0], stream=h2d_stream + ) h2d_lower_diagonal_events[0].record(stream=h2d_stream) - # --- Computations --- for i in range(0, n_diag_blocks - 1): # pass next B block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2].set( arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], - stream = h2d_stream + stream=h2d_stream, ) h2d_B_events[(i + 1) % 2].record(stream=h2d_stream) - if i + 1 < n_diag_blocks - 1: # pass next diagonal block h2d_stream.wait_event(compute_current_B_events[(i + 1) % 2]) L_diagonal_blocks_d[(i + 1) % 2].set( - arr=L_diagonal_blocks[i + 1], - stream=h2d_stream + 
arr=L_diagonal_blocks[i + 1], stream=h2d_stream ) - - h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) + h2d_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: # Solve current B block @@ -363,22 +354,20 @@ def _pobtas_streaming( ) compute_current_B_events[i % 2].record(stream=compute_stream) - + # Pass current B block back - if i + 1 < n_diag_blocks - 1: # Pass next lower diagonal block h2d_stream.wait_event(compute_next_B_events[(i + 1) % 2]) L_lower_diagonal_blocks_d[(i + 1) % 2].set( - arr=L_lower_diagonal_blocks[i + 1], - stream=h2d_stream + arr=L_lower_diagonal_blocks[i + 1], stream=h2d_stream ) - + h2d_lower_diagonal_events[(i + 1) % 2].record(stream=h2d_stream) d2h_stream.wait_event(compute_current_B_events[i % 2]) - d2h_stream.wait_event(h2d_lower_diagonal_events[(i+1) % 2]) + d2h_stream.wait_event(h2d_lower_diagonal_events[(i + 1) % 2]) B_d[i % 2].get( out=B[i * diag_blocksize : (i + 1) * diag_blocksize], @@ -387,131 +376,145 @@ def _pobtas_streaming( ) d2h_B_events[i % 2].record(stream=d2h_stream) - + with compute_stream: # Update next B block compute_stream.wait_event(h2d_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2] -= ( - L_lower_diagonal_blocks_d[i % 2] - @ B_d[i % 2] - ) + B_d[(i + 1) % 2] -= L_lower_diagonal_blocks_d[i % 2] @ B_d[i % 2] + + compute_next_B_events[i % 2].record(stream=compute_stream) - compute_next_B_events[i % 2].record(stream=compute_stream) - if i + 1 < n_diag_blocks - 1: # Pass next lower arrow block h2d_stream.wait_event(compute_arrow_B_events[(i + 1) % 2]) L_lower_arrow_blocks_d[(i + 1) % 2].set( - arr=L_lower_arrow_blocks[i + 1], - stream=h2d_stream + arr=L_lower_arrow_blocks[i + 1], stream=h2d_stream ) - + h2d_arrow_events[(i + 1) % 2].record(stream=h2d_stream) with compute_stream: # Update arrow tip compute_stream.wait_event(h2d_arrow_events[i % 2]) - - B_arrow_tip_d -= ( - L_lower_arrow_blocks_d[i % 2] - @ B_d[i % 2] - ) + + B_arrow_tip_d -= L_lower_arrow_blocks_d[i % 2] @ B_d[i % 2] 
compute_arrow_B_events[i % 2].record(stream=compute_stream) # Pass arrow tip back d2h_stream.wait_event(compute_arrow_B_events[n_diag_blocks % 2]) - - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - d2h_tip_events[n_diag_blocks % 2].record(stream=d2h_stream) + B_arrow_tip_d.get( + out=B[-arrow_blocksize:], + stream=d2h_stream, + blocking=False, + ) + d2h_tip_events[n_diag_blocks % 2].record(stream=d2h_stream) if not partial: # Pass last blocks h2d_stream.wait_event(d2h_tip_events[n_diag_blocks % 2]) - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream) - + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set( + arr=L_diagonal_blocks[n_diag_blocks - 1], stream=h2d_stream + ) + h2d_diagonal_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) - L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) - + L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set( + arr=L_lower_arrow_blocks[-1], stream=h2d_stream + ) + h2d_arrow_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) - with compute_stream: # Solve last B block compute_stream.wait_event(h2d_diagonal_events[(n_diag_blocks - 1) % 2]) B_d[(n_diag_blocks - 1) % 2] = cu_la.solve_triangular( - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], - B_d[(n_diag_blocks - 1) % 2], - lower=True + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2], + lower=True, ) - + compute_partial_events[0].record(stream=compute_stream) # Pass last B block back d2h_stream.wait_event(compute_partial_events[0]) B_d[(n_diag_blocks - 1) % 2].get( - out=B[(n_diag_blocks - 1) * diag_blocksize : n_diag_blocks * diag_blocksize], - stream=d2h_stream, - blocking=False + out=B[ + (n_diag_blocks - 1) + * diag_blocksize : n_diag_blocks + * diag_blocksize + ], + stream=d2h_stream, + blocking=False, ) - + d2h_B_events[0].record(stream=d2h_stream) with compute_stream: # Solve arrow tip 
compute_stream.wait_event(h2d_arrow_events[(n_diag_blocks - 1) % 2]) - B_arrow_tip_d -= (L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2] @ B_d[(n_diag_blocks - 1) % 2]) - B_arrow_tip_d = cu_la.solve_triangular(L_arrow_tip_block_d, B_arrow_tip_d, lower=True) + B_arrow_tip_d -= ( + L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2] + @ B_d[(n_diag_blocks - 1) % 2] + ) + B_arrow_tip_d = cu_la.solve_triangular( + L_arrow_tip_block_d, B_arrow_tip_d, lower=True + ) compute_partial_events[1].record(stream=compute_stream) d2h_stream.wait_event(compute_partial_events[1]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) + B_arrow_tip_d.get( + out=B[-arrow_blocksize:], + stream=d2h_stream, + blocking=False, + ) elif trans == "T" or trans == "C": # ----- Backward substitution ----- # Buffers - B_previous_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) + B_previous_d = cp.empty((2, *B_shape.shape), dtype=B_shape.dtype) # Delete helper variable del B_shape - + # Events compute_B_events = [cp.cuda.Event(), cp.cuda.Event()] h2d_events = [cp.cuda.Event(), cp.cuda.Event()] d2h_events = [cp.cuda.Event(), cp.cuda.Event()] - + # --- H2D: transfers --- B_arrow_tip_d.set(arr=B[-arrow_blocksize:], stream=h2d_stream) L_arrow_tip_block_d.set(arr=L_arrow_tip_block[:], stream=h2d_stream) B_d[(n_diag_blocks - 1) % 2].set( - arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], - stream=h2d_stream + arr=B[-arrow_blocksize - diag_blocksize : -arrow_blocksize], + stream=h2d_stream, + ) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set( + arr=L_diagonal_blocks[-1], stream=h2d_stream + ) + L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set( + arr=L_lower_arrow_blocks[-1], stream=h2d_stream ) - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) - L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_lower_arrow_blocks[-1], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 
2].record(stream=h2d_stream) - + # ----- Backward substitution ----- if not partial: - + with compute_stream: # X_{ndb+1} = L_{ndb+1,ndb+1}^{-T} (Y_{ndb+1}) compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) - B_arrow_tip_d = cu_la.solve_triangular( + B_arrow_tip_d = cu_la.solve_triangular( L_arrow_tip_block_d, B_arrow_tip_d, lower=True, @@ -519,14 +522,13 @@ def _pobtas_streaming( ) # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) - B_previous_d[(n_diag_blocks - 1) % 2] = ( - cu_la.solve_triangular( - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], - B_d[(n_diag_blocks - 1) % 2] - - L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].conj().T @ B_arrow_tip_d, - lower=True, - trans="C", - ) + B_previous_d[(n_diag_blocks - 1) % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2] + - L_lower_arrow_blocks_d[(n_diag_blocks - 1) % 2].conj().T + @ B_arrow_tip_d, + lower=True, + trans="C", ) compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) @@ -534,39 +536,61 @@ def _pobtas_streaming( # Pass arrow tip back d2h_stream.wait_event(compute_B_events[(n_diag_blocks - 1) % 2]) - B_arrow_tip_d.get(out=B[-arrow_blocksize:], stream=d2h_stream, blocking=False,) - + B_arrow_tip_d.get( + out=B[-arrow_blocksize:], + stream=d2h_stream, + blocking=False, + ) if n_diag_blocks > 1: B_d[n_diag_blocks % 2].set( - arr=B[-arrow_blocksize - (2 * diag_blocksize) : -arrow_blocksize - diag_blocksize], - stream=h2d_stream + arr=B[ + -arrow_blocksize + - (2 * diag_blocksize) : -arrow_blocksize + - diag_blocksize + ], + stream=h2d_stream, + ) + L_diagonal_blocks_d[n_diag_blocks % 2].set( + arr=L_diagonal_blocks[-2], stream=h2d_stream ) - L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) - L_lower_arrow_blocks_d[n_diag_blocks % 2].set(arr=L_lower_arrow_blocks[-2], stream=h2d_stream) - L_lower_diagonal_blocks_d[n_diag_blocks % 
2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) - + L_lower_arrow_blocks_d[n_diag_blocks % 2].set( + arr=L_lower_arrow_blocks[-2], stream=h2d_stream + ) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set( + arr=L_lower_diagonal_blocks[-1], stream=h2d_stream + ) + h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) for i in range(n_diag_blocks - 2, -1, -1): - + if i > 0: # Pass new blocks h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) - L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) - L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) - L_lower_arrow_blocks_d[(i - 1) % 2].set(arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream) + B_d[(i - 1) % 2].set( + arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], + stream=h2d_stream, + ) + L_diagonal_blocks_d[(i - 1) % 2].set( + arr=L_diagonal_blocks[i - 1], stream=h2d_stream + ) + L_lower_diagonal_blocks_d[(i - 1) % 2].set( + arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream + ) + L_lower_arrow_blocks_d[(i - 1) % 2].set( + arr=L_lower_arrow_blocks[i - 1], stream=h2d_stream + ) h2d_events[i % 2].record(stream=h2d_stream) - + with compute_stream: # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i - 1) % 2]) compute_stream.wait_event(d2h_events[(i - 1) % 2]) - + B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] @@ -583,10 +607,9 @@ def _pobtas_streaming( d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) B_previous_d[(i - 1) % 2].get( - out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], - stream=d2h_stream, - blocking=False - + out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream=d2h_stream, + blocking=False, ) d2h_events[i % 2].record(stream=d2h_stream) @@ -597,6 +620,5 @@ def 
_pobtas_streaming( else: raise ValueError(f"Invalid transpose argument: {trans}.") - - cp.cuda.Device().synchronize() \ No newline at end of file + cp.cuda.Device().synchronize() diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 295be756..dc570116 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -181,13 +181,9 @@ def _pobts_streaming( # Device Buffers # B Buffers - B_shape = B[0 : diag_blocksize] - B_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) - B_previous_d = cp.empty( - (2, *B_shape.shape), dtype=B_shape.dtype - ) + B_shape = B[0:diag_blocksize] + B_d = cp.empty((2, *B_shape.shape), dtype=B_shape.dtype) + B_previous_d = cp.empty((2, *B_shape.shape), dtype=B_shape.dtype) del B_shape # L Buffers @@ -213,12 +209,11 @@ def _pobts_streaming( h2d_events[1].record(stream=h2d_stream) if n_diag_blocks > 1: - B_d[1].set( - arr=B[diag_blocksize : (2 * diag_blocksize)], - stream=h2d_stream - ) + B_d[1].set(arr=B[diag_blocksize : (2 * diag_blocksize)], stream=h2d_stream) L_diagonal_blocks_d[1].set(arr=L_diagonal_blocks[1], stream=h2d_stream) - L_lower_diagonal_blocks_d[1].set(arr=L_lower_diagonal_blocks[0], stream=h2d_stream) + L_lower_diagonal_blocks_d[1].set( + arr=L_lower_diagonal_blocks[0], stream=h2d_stream + ) h2d_events[0].record(stream=h2d_stream) @@ -226,28 +221,33 @@ def _pobts_streaming( # Solve first B block compute_stream.wait_event(h2d_events[1]) - B_previous_d[0] = ( - cu_la.solve_triangular( - L_diagonal_blocks_d[0], - B_d[0], - lower=True, - ) + B_previous_d[0] = cu_la.solve_triangular( + L_diagonal_blocks_d[0], + B_d[0], + lower=True, ) compute_B_events[0].record(stream=compute_stream) for i in range(1, n_diag_blocks): - + if i + 1 < n_diag_blocks: # Pass next blocks h2d_stream.wait_event(compute_B_events[(i + 1) % 2]) - B_d[(i + 1) % 2].set(arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=h2d_stream) - L_diagonal_blocks_d[(i + 1) % 2].set(arr=L_diagonal_blocks[i + 1], 
stream=h2d_stream) - L_lower_diagonal_blocks_d[(i + 1) % 2].set(arr=L_lower_diagonal_blocks[i], stream=h2d_stream) - + B_d[(i + 1) % 2].set( + arr=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream=h2d_stream, + ) + L_diagonal_blocks_d[(i + 1) % 2].set( + arr=L_diagonal_blocks[i + 1], stream=h2d_stream + ) + L_lower_diagonal_blocks_d[(i + 1) % 2].set( + arr=L_lower_diagonal_blocks[i], stream=h2d_stream + ) + h2d_events[i % 2].record(stream=h2d_stream) - + with compute_stream: # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i + 1) % 2]) @@ -256,8 +256,7 @@ def _pobts_streaming( B_previous_d[i % 2] = cu_la.solve_triangular( L_diagonal_blocks_d[i % 2], B_d[i % 2] - - L_lower_diagonal_blocks_d[i % 2] - @ B_previous_d[(i + 1) % 2], + - L_lower_diagonal_blocks_d[i % 2] @ B_previous_d[(i + 1) % 2], lower=True, ) @@ -265,38 +264,44 @@ def _pobts_streaming( # Pass previous B block back d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - + B_previous_d[(i + 1) % 2].get( - out=B[(i - 1) * diag_blocksize : i * diag_blocksize], - stream=d2h_stream, - blocking=False + out=B[(i - 1) * diag_blocksize : i * diag_blocksize], + stream=d2h_stream, + blocking=False, ) - + d2h_events[i % 2].record(stream=d2h_stream) # Pass last B block back d2h_stream.wait_event(compute_B_events[(n_diag_blocks + 1) % 2]) - - B_previous_d[(n_diag_blocks + 1) % 2].get(out=B[-diag_blocksize:], stream=d2h_stream, blocking=False) - - + + B_previous_d[(n_diag_blocks + 1) % 2].get( + out=B[-diag_blocksize:], stream=d2h_stream, blocking=False + ) + elif trans == "T" or trans == "C": # ----- Backward substitution ----- # --- H2D: transfers --- B_d[(n_diag_blocks - 1) % 2].set(arr=B[-diag_blocksize:], stream=h2d_stream) - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set(arr=L_diagonal_blocks[-1], stream=h2d_stream) + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2].set( + arr=L_diagonal_blocks[-1], stream=h2d_stream + ) 
h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) if n_diag_blocks > 1: B_d[n_diag_blocks % 2].set( - arr=B[-(2 * diag_blocksize) : -diag_blocksize], - stream=h2d_stream + arr=B[-(2 * diag_blocksize) : -diag_blocksize], stream=h2d_stream + ) + L_diagonal_blocks_d[n_diag_blocks % 2].set( + arr=L_diagonal_blocks[-2], stream=h2d_stream + ) + L_lower_diagonal_blocks_d[n_diag_blocks % 2].set( + arr=L_lower_diagonal_blocks[-1], stream=h2d_stream ) - L_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_diagonal_blocks[-2], stream=h2d_stream) - L_lower_diagonal_blocks_d[n_diag_blocks % 2].set(arr=L_lower_diagonal_blocks[-1], stream=h2d_stream) h2d_events[(n_diag_blocks - 1) % 2].record(stream=h2d_stream) @@ -304,31 +309,34 @@ def _pobts_streaming( # X_{ndb} = L_{ndb,ndb}^{-T} (Y_{ndb} - L_{ndb+1,ndb}^{T} X_{ndb+1}) compute_stream.wait_event(h2d_events[(n_diag_blocks - 1) % 2]) - B_previous_d[(n_diag_blocks - 1) % 2] = ( - cu_la.solve_triangular( - L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], - B_d[(n_diag_blocks - 1) % 2], - lower=True, - trans="C", - ) + B_previous_d[(n_diag_blocks - 1) % 2] = cu_la.solve_triangular( + L_diagonal_blocks_d[(n_diag_blocks - 1) % 2], + B_d[(n_diag_blocks - 1) % 2], + lower=True, + trans="C", ) compute_B_events[(n_diag_blocks - 1) % 2].record(stream=compute_stream) - - for i in range(n_diag_blocks - 2, -1, -1): - + if i > 0: # pass next blocks h2d_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_d[(i - 1) % 2].set(arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], stream=h2d_stream) - L_diagonal_blocks_d[(i - 1) % 2].set(arr=L_diagonal_blocks[i - 1], stream=h2d_stream) - L_lower_diagonal_blocks_d[(i - 1) % 2].set(arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream) + B_d[(i - 1) % 2].set( + arr=B[(i - 1) * diag_blocksize : i * diag_blocksize], + stream=h2d_stream, + ) + L_diagonal_blocks_d[(i - 1) % 2].set( + arr=L_diagonal_blocks[i - 1], stream=h2d_stream + ) + L_lower_diagonal_blocks_d[(i - 1) % 2].set( + 
arr=L_lower_diagonal_blocks[i - 1], stream=h2d_stream + ) h2d_events[i % 2].record(stream=h2d_stream) - + with compute_stream: # X_{i} = L_{i,i}^{-T} (Y_{i} - L_{i+1,i}^{T} X_{i+1}) - L_{ndb+1,i}^T X_{ndb+1} compute_stream.wait_event(h2d_events[(i - 1) % 2]) @@ -348,16 +356,20 @@ def _pobts_streaming( # Pass previous B block back d2h_stream.wait_event(compute_B_events[(i - 1) % 2]) - B_previous_d[(i - 1) % 2].get(out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], stream=d2h_stream, blocking=False) - + B_previous_d[(i - 1) % 2].get( + out=B[(i + 1) * diag_blocksize : (i + 2) * diag_blocksize], + stream=d2h_stream, + blocking=False, + ) + d2h_events[i % 2].record(stream=d2h_stream) # Pass last B block back d2h_stream.wait_event(compute_B_events[0]) - + B_previous_d[0].get(out=B[:diag_blocksize], stream=d2h_stream, blocking=False) else: raise ValueError(f"Invalid transpose argument: {trans}.") - - cp.cuda.Device().synchronize() \ No newline at end of file + + cp.cuda.Device().synchronize() diff --git a/src/serinv/wrappers/ddbtars.py b/src/serinv/wrappers/ddbtars.py index beb34390..d836d25a 100644 --- a/src/serinv/wrappers/ddbtars.py +++ b/src/serinv/wrappers/ddbtars.py @@ -13,7 +13,6 @@ import cupyx as cpx - def allocate_ddbtars( A_diagonal_blocks: ArrayLike, A_lower_diagonal_blocks: ArrayLike, diff --git a/src/serinv/wrappers/pddbtasc.py b/src/serinv/wrappers/pddbtasc.py index 3b9a4d4a..da2fcff4 100644 --- a/src/serinv/wrappers/pddbtasc.py +++ b/src/serinv/wrappers/pddbtasc.py @@ -42,7 +42,7 @@ def pddbtasc( The arrow tip block of the block tridiagonal with arrowhead matrix. comm : MPI.Comm The MPI communicator. Default is MPI.COMM_WORLD. 
- + Keyword Arguments ----------------- rhs : dict diff --git a/src/serinv/wrappers/pddbtasci.py b/src/serinv/wrappers/pddbtasci.py index f86b6a9c..0ed92861 100644 --- a/src/serinv/wrappers/pddbtasci.py +++ b/src/serinv/wrappers/pddbtasci.py @@ -43,7 +43,7 @@ def pddbtasci( The arrow tip block of the block tridiagonal with arrowhead matrix. comm : MPI.Comm The MPI communicator. Default is MPI.COMM_WORLD. - + Keyword Arguments ----------------- rhs : dict diff --git a/src/serinv/wrappers/pddbtsc.py b/src/serinv/wrappers/pddbtsc.py index 357b08f6..fc0a3765 100644 --- a/src/serinv/wrappers/pddbtsc.py +++ b/src/serinv/wrappers/pddbtsc.py @@ -33,7 +33,7 @@ def pddbtsc( The upper diagonal blocks of the block tridiagonal with arrowhead matrix. comm : MPI.Comm The MPI communicator. Default is MPI.COMM_WORLD. - + Keyword Arguments ----------------- rhs : dict @@ -179,4 +179,4 @@ def pddbtsc( quadratic=quadratic, ) - comm.Barrier() \ No newline at end of file + comm.Barrier() diff --git a/src/serinv/wrappers/pddbtsci.py b/src/serinv/wrappers/pddbtsci.py index 144054f8..9c494e13 100644 --- a/src/serinv/wrappers/pddbtsci.py +++ b/src/serinv/wrappers/pddbtsci.py @@ -34,7 +34,7 @@ def pddbtsci( The upper diagonal blocks of the block tridiagonal matrix. comm : MPI.Comm The MPI communicator. Default is MPI.COMM_WORLD. 
- + Keyword Arguments ----------------- rhs : dict diff --git a/tests/conftest.py b/tests/conftest.py index 4d15c7db..25b716a6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,7 @@ pytest.param(3, id="diagonal_blocksize=3"), ] + @pytest.fixture(params=ARRAY_TYPE, autouse=True) def array_type(request: pytest.FixtureRequest) -> str: return request.param diff --git a/tests/tests_algs/permuted/test_bt/test_pobts_permuted.py b/tests/tests_algs/permuted/test_bt/test_pobts_permuted.py index bba1068a..7a934c61 100644 --- a/tests/tests_algs/permuted/test_bt/test_pobts_permuted.py +++ b/tests/tests_algs/permuted/test_bt/test_pobts_permuted.py @@ -44,9 +44,7 @@ def test_pobts_permuted( A_diagonal_blocks, A_lower_diagonal_blocks, _, - ) = bt_dense_to_arrays( - A.copy(), diagonal_blocksize, n_diag_blocks - ) + ) = bt_dense_to_arrays(A.copy(), diagonal_blocksize, n_diag_blocks) # Allocate permutation buffer buffer = allocate_pobtx_permutation_buffers( diff --git a/tests/tests_algs/regular/tests_bta/test_pobtaf.py b/tests/tests_algs/regular/tests_bta/test_pobtaf.py index 98756357..920c6292 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtaf.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtaf.py @@ -25,6 +25,7 @@ def array_type(request: pytest.FixtureRequest) -> str: return request.param + @pytest.mark.mpi_skip() def test_pobtaf( diagonal_blocksize: int, diff --git a/tests/tests_algs/regular/tests_bta/test_pobtas.py b/tests/tests_algs/regular/tests_bta/test_pobtas.py index d58e3a19..ffc290c2 100644 --- a/tests/tests_algs/regular/tests_bta/test_pobtas.py +++ b/tests/tests_algs/regular/tests_bta/test_pobtas.py @@ -25,6 +25,7 @@ def array_type(request: pytest.FixtureRequest) -> str: return request.param + @pytest.mark.mpi_skip() @pytest.mark.parametrize("n_rhs", [1, 2, 3]) def test_pobtas( @@ -35,7 +36,7 @@ def test_pobtas( array_type: str, dtype: np.dtype, ): - + A = dd_bta( diagonal_blocksize, arrowhead_blocksize, From 
6ff4ed716dce6852034466a3861f784d8f49fb8b Mon Sep 17 00:00:00 2001 From: 03szust Date: Fri, 6 Jun 2025 09:04:02 +0000 Subject: [PATCH 242/242] changed errors --- src/serinv/algs/pobtas.py | 2 +- src/serinv/algs/pobts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serinv/algs/pobtas.py b/src/serinv/algs/pobtas.py index 1575a4ac..60d2b97d 100644 --- a/src/serinv/algs/pobtas.py +++ b/src/serinv/algs/pobtas.py @@ -235,7 +235,7 @@ def _pobtas_streaming( ): arr_module, _ = _get_module_from_array(arr=L_diagonal_blocks) if arr_module.__name__ != "numpy": - raise NotImplementedError( + raise TypeError( "Host<->Device streaming only works when host-arrays are given." ) diff --git a/src/serinv/algs/pobts.py b/src/serinv/algs/pobts.py index 295be756..d065b160 100644 --- a/src/serinv/algs/pobts.py +++ b/src/serinv/algs/pobts.py @@ -164,7 +164,7 @@ def _pobts_streaming( ): arr_module, _ = _get_module_from_array(arr=L_diagonal_blocks) if arr_module.__name__ != "numpy": - raise NotImplementedError( + raise TypeError( "Host<->Device streaming only works when host-arrays are given." )