From 405bb91bff72ede371e781bf74b48e5ad9489575 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:25:28 +0100 Subject: [PATCH 01/10] add rocsparse spgemm --- include/spblas/vendor/rocsparse/multiply.hpp | 2 +- .../vendor/rocsparse/multiply_spgemm.hpp | 326 ++++++++++++++++++ include/spblas/vendor/rocsparse/rocsparse.hpp | 1 + 3 files changed, 328 insertions(+), 1 deletion(-) create mode 100644 include/spblas/vendor/rocsparse/multiply_spgemm.hpp diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index f276fbf..7d88c5c 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -86,7 +86,7 @@ class spmv_state_t { // only allocate the new workspace when the requiring workspace larger than // current if (buffer_size > this->buffer_size_) { - this->alloc_.deallocate(this->workspace_, buffer_size_); + this->alloc_.deallocate(this->workspace_, this->buffer_size_); this->buffer_size_ = buffer_size; this->workspace_ = this->alloc_.allocate(buffer_size); } diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp new file mode 100644 index 0000000..7f215df --- /dev/null +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -0,0 +1,326 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "exception.hpp" +#include "hip_allocator.hpp" +#include "types.hpp" + +namespace spblas { +namespace __rocsparse { + +template +T create_null_matrix() { + return {nullptr, nullptr, nullptr, index{0, 0}, 0}; +} + +} // namespace __rocsparse + +class spgemm_state_t { +public: + spgemm_state_t() : spgemm_state_t(rocsparse::hip_allocator{}) {} + + spgemm_state_t(rocsparse::hip_allocator alloc) + : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + rocsparse_handle handle; + __rocsparse::throw_if_error(rocsparse_create_handle(&handle)); + if (auto stream = alloc.stream()) { + rocsparse_set_stream(handle, stream); + } + handle_ = handle_manager(handle, [](rocsparse_handle handle) { + __rocsparse::throw_if_error(rocsparse_destroy_handle(handle)); + }); + } + + spgemm_state_t(rocsparse::hip_allocator alloc, rocsparse_handle handle) + : alloc_(alloc), buffer_size_(0), workspace_(nullptr), result_nnz_(0), + result_shape_(0, 0) { + handle_ = handle_manager(handle, [](rocsparse_handle handle) { + // it is provided by user, we do not delete it at all. + }); + } + + ~spgemm_state_t() { + alloc_.deallocate(this->workspace_, this->buffer_size_); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_a_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_b_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_c_)); + __rocsparse::throw_if_error(rocsparse_destroy_spmat_descr(this->mat_d_)); + } + + auto result_shape() { + return this->result_shape_; + } + + auto result_nnz() { + return this->result_nnz_; + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_compute(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + size_t buffer_size = 0; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + auto handle = this->handle_.get(); + // Create sparse matrix A in CSR format + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], + a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), + a_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], + b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), + b_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1], + 0, c.rowptr().data(), NULL, NULL, + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + __rocsparse::throw_if_error(rocsparse_create_csr_descr( + &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1], + d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(), + d_base.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + //-------------------------------------------------------------------------- + // SpGEMM Computation + // ask buffer_size bytes for external memory + __rocsparse::throw_if_error(rocsparse_spgemm( + handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, + this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); + if (buffer_size > this->buffer_size_) { + this->alloc_.deallocate(workspace_, this->buffer_size_); + this->buffer_size_ = buffer_size; + workspace_ = this->alloc_.allocate(buffer_size); + } + __rocsparse::throw_if_error(rocsparse_spgemm( + handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, + this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_)); + // get matrix C non-zero entries and size + int64_t c_num_rows; + int64_t c_num_cols; + __rocsparse::throw_if_error(rocsparse_spmat_get_size( + this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_)); + this->result_shape_ = index(c_num_rows, c_num_cols); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_fill(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __rocsparse::throw_if_error(rocsparse_spgemm( + handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_symbolic_fill(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + value_type alpha = alpha_optional.value_or(1); + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + + __rocsparse::throw_if_error(rocsparse_spgemm( + this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_symbolic, &this->buffer_size_, + this->workspace_)); + } + + template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base + void multiply_numeric(A&& a, B&& b, C&& c, D&& d) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(b); + using matrix_type = decltype(a_base); + using input_type = decltype(b_base); + using output_type = std::remove_reference_t; + using value_type = typename matrix_type::scalar_type; + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + value_type alpha_val = alpha; + auto beta_optional = __detail::get_scaling_factor(d); + value_type beta = beta_optional.value_or(1); + + __rocsparse::throw_if_error(rocsparse_spgemm( + this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, + &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, + to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_)); + } + +private: + using handle_manager = + std::unique_ptr::element_type, + std::function>; + handle_manager handle_; + rocsparse::hip_allocator alloc_; + long unsigned int buffer_size_; + char* workspace_; + index result_shape_; + index_t result_nnz_; + rocsparse_spmat_descr mat_a_; + rocsparse_spmat_descr mat_b_; + rocsparse_spmat_descr mat_c_; + rocsparse_spmat_descr mat_d_; +}; + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_inspect(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_compute(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, D&& d) { + spgemm_handle.multiply_fill(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c, D&& d) { + spgemm_handle.multiply_compute(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_symbolic_fill(a, b, c, d); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v && __detail::has_csr_base +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c, + D&& d) { + spgemm_handle.multiply_numeric(a, b, c, d); +} + +// the followings support C = A*B by giving null D matrix. +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_fill(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_compute(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_compute(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_symbolic_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, + C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_symbolic_fill(a, b, c, scaled(0.0, d)); +} + +template + requires __detail::has_csr_base && __detail::has_csr_base && + __detail::is_csr_view_v +void multiply_numeric(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) { + auto d = __rocsparse::create_null_matrix>(); + spgemm_handle.multiply_numeric(a, b, c, scaled(0.0, d)); +} + +} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/rocsparse.hpp b/include/spblas/vendor/rocsparse/rocsparse.hpp index 7caa698..014b2ba 100644 --- a/include/spblas/vendor/rocsparse/rocsparse.hpp +++ b/include/spblas/vendor/rocsparse/rocsparse.hpp @@ -1,3 +1,4 @@ #pragma once #include "multiply.hpp" +#include "multiply_spgemm.hpp" From 9263ec1978212f5437695f53f8787a76ecf18d70 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:26:42 +0100 Subject: [PATCH 02/10] add rocsparse spgemm with 3 args test --- include/spblas/vendor/rocsparse/exception.hpp | 4 +- test/gtest/CMakeLists.txt | 6 +- test/gtest/device/spgemm_test.cpp | 273 ++++++++++++++++++ 3 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 test/gtest/device/spgemm_test.cpp diff --git a/include/spblas/vendor/rocsparse/exception.hpp b/include/spblas/vendor/rocsparse/exception.hpp index 6a2ed5c..cb836b5 100644 --- a/include/spblas/vendor/rocsparse/exception.hpp +++ b/include/spblas/vendor/rocsparse/exception.hpp @@ -10,7 +10,7 @@ namespace spblas { namespace __rocsparse { // Throw an exception if the hipError_t is not hipSuccess. -void throw_if_error(hipError_t error_code, std::string prefix = "") { +inline void throw_if_error(hipError_t error_code, std::string prefix = "") { if (error_code == hipSuccess) { return; } @@ -21,7 +21,7 @@ void throw_if_error(hipError_t error_code, std::string prefix = "") { } // Throw an exception if the rocsparse_status is not rocsparse_status_success. -void throw_if_error(rocsparse_status error_code) { +inline void throw_if_error(rocsparse_status error_code) { if (error_code == rocsparse_status_success) { return; } else if (error_code == rocsparse_status_invalid_handle) { diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 97c54d2..2ac50d3 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -22,7 +22,11 @@ if (NOT SPBLAS_GPU_BACKEND) triangular_solve_test.cpp ) else() - set(TEST_SOURCES device/spmv_test.cpp) + if(ENABLE_ROCSPARSE) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) + else() + set(TEST_SOURCES device/spmv_test.cpp) + endif() add_device_test(TEST_SOURCES) endif() diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp new file mode 100644 index 0000000..4662ffd --- /dev/null +++ b/test/gtest/device/spgemm_test.cpp @@ -0,0 +1,273 @@ + +#include "../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMM) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, spblas::scaled(alpha, d_a), d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, spblas::scaled(alpha, d_a), d_b, d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, spblas::scaled(alpha, d_b), d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, spblas::scaled(alpha, d_b), d_c); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} From 59cfabe2dad7a806f68c552bbac2773e96e41fac Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 18:38:26 +0100 Subject: [PATCH 03/10] add rocsparse spgemm with 4 args test --- test/gtest/CMakeLists.txt | 2 +- .../device/rocsparse/spgemm_4args_test.cpp | 421 ++++++++++++++++++ 2 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 test/gtest/device/rocsparse/spgemm_4args_test.cpp diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 2ac50d3..c3b554b 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND) ) else() if(ENABLE_ROCSPARSE) - set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp) else() set(TEST_SOURCES device/spmv_test.cpp) endif() diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp new file mode 100644 index 0000000..f1b272b --- /dev/null +++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp @@ -0,0 +1,421 @@ + +#include "../../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMM_4Args) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, scaled(alpha, d_a), d_b, d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, scaled(alpha, d_a), d_b, d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, scaled(alpha, d_b), d_c, d_d); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, scaled(alpha, d_b), d_c, d_d); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} + +TEST(CsrView, SpGEMM_4Args_DScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + auto [d_values, d_rowptr, d_colind, d_shape, d_nnz] = + spblas::generate_csr(m, n, nnz); + thrust::device_vector d_d_values(d_values); + thrust::device_vector d_d_rowptr(d_rowptr); + thrust::device_vector d_d_colind(d_colind); + spblas::csr_view d_d( + d_d_values.data().get(), d_d_rowptr.data().get(), + d_d_colind.data().get(), d_shape, d_nnz); + spblas::csr_view d(d_values, d_rowptr, + d_colind, d_shape, d_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_compute(state, d_a, d_b, d_c, scaled(alpha, d_d)); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_fill(state, d_a, d_b, d_c, scaled(alpha, d_d)); + + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + auto&& d_row = spblas::__backend::lookup_row(d, i); + for (auto&& [k, d_v] : d_row) { + c_row_ref[k] += alpha * d_v; + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } +} From ed7d26f81c97719119f2e47f64672fe05f4b72b8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 27 Mar 2025 19:16:27 +0100 Subject: [PATCH 04/10] add rocsparse spgemm symbolic/numeric test --- test/gtest/CMakeLists.txt | 2 +- test/gtest/device/spgemm_reuse_test.cpp | 324 ++++++++++++++++++++++++ 2 files changed, 325 insertions(+), 1 deletion(-) create mode 100644 test/gtest/device/spgemm_reuse_test.cpp diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index c3b554b..18de3e6 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -23,7 +23,7 @@ if (NOT SPBLAS_GPU_BACKEND) ) else() if(ENABLE_ROCSPARSE) - set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/rocsparse/spgemm_4args_test.cpp) + set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) else() set(TEST_SOURCES device/spmv_test.cpp) endif() diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp new file mode 100644 index 0000000..931c573 --- /dev/null +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -0,0 +1,324 @@ + +#include "../util.hpp" +#include + +#include +#include + +using value_t = float; +using index_t = spblas::index_t; +using offset_t = spblas::offset_t; + +TEST(CsrView, SpGEMMReuse) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, d_a, d_b, d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} + +TEST(CsrView, SpGEMMReuse_AScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, scaled(alpha, d_a), d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} + +TEST(CsrView, SpGEMMReuse_BScaled) { + value_t alpha = 2.0f; + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, scaled(alpha, d_b), d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // we can change the value of a and b but only need to call + // multiply_numeric answer here. + if (i != 0) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + thrust::copy(a_values.begin(), a_values.end(), d_a_values.begin()); + thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin()); + } + spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c); + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin()); + thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin()); + thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin()); + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += alpha * a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} From e7ecada6d7f0a8b808077a231d80e079402560e9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 17 Apr 2025 16:43:16 +0200 Subject: [PATCH 05/10] add the test when changing the pointer --- .../vendor/rocsparse/multiply_spgemm.hpp | 17 +++ test/gtest/device/spgemm_reuse_test.cpp | 124 ++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 7f215df..74e6f70 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -202,6 +202,7 @@ class spgemm_state_t { void multiply_numeric(A&& a, B&& b, C&& c, D&& d) { auto a_base = __detail::get_ultimate_base(a); auto b_base = __detail::get_ultimate_base(b); + auto d_base = __detail::get_ultimate_base(d); using matrix_type = decltype(a_base); using input_type = decltype(b_base); using output_type = std::remove_reference_t; @@ -213,6 +214,22 @@ class spgemm_state_t { auto beta_optional = __detail::get_scaling_factor(d); value_type beta = beta_optional.value_or(1); + // Update the pointer from the matrix but they must contains the same + // sparsity as the previous call. + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_a_, a_base.rowptr().data(), a_base.colind().data(), + a_base.values().data())); + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_b_, b_base.rowptr().data(), b_base.colind().data(), + b_base.values().data())); + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data())); + if (d_base.values().data()) { + // when it is still a null matrix, we can not use set pointer function + __rocsparse::throw_if_error(rocsparse_csr_set_pointers( + this->mat_d_, d_base.rowptr().data(), d_base.colind().data(), + d_base.values().data())); + } __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index 931c573..836190e 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -322,3 +322,127 @@ TEST(CsrView, SpGEMMReuse_BScaled) { } } } + +TEST(CsrView, SpGEMMReuseAndChangePointer) { + for (auto&& [m, k, nnz] : util::dims) { + for (auto&& n : {m, k}) { + auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = + spblas::generate_csr(m, k, nnz); + thrust::device_vector d_a_values(a_values); + thrust::device_vector d_a_rowptr(a_rowptr); + thrust::device_vector d_a_colind(a_colind); + spblas::csr_view d_a( + d_a_values.data().get(), d_a_rowptr.data().get(), + d_a_colind.data().get(), a_shape, a_nnz); + spblas::csr_view a(a_values, a_rowptr, + a_colind, a_shape, a_nnz); + + auto [b_values, b_rowptr, b_colind, b_shape, b_nnz] = + spblas::generate_csr(k, n, nnz); + thrust::device_vector d_b_values(b_values); + thrust::device_vector d_b_rowptr(b_rowptr); + thrust::device_vector d_b_colind(b_colind); + spblas::csr_view d_b( + d_b_values.data().get(), d_b_rowptr.data().get(), + d_b_colind.data().get(), b_shape, b_nnz); + spblas::csr_view b(b_values, b_rowptr, + b_colind, b_shape, b_nnz); + + thrust::device_vector d_c_rowptr(m + 1); + + spblas::csr_view d_c( + nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0); + + spblas::spgemm_state_t state; + spblas::multiply_symbolic_compute(state, d_a, d_b, d_c); + auto nnz = state.result_nnz(); + thrust::device_vector d_c_values(nnz); + thrust::device_vector d_c_colind(nnz); + std::span d_c_values_span(d_c_values.data().get(), nnz); + std::span d_c_rowptr_span(d_c_rowptr.data().get(), m + 1); + std::span d_c_colind_span(d_c_colind.data().get(), nnz); + d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n}, + nnz); + + spblas::multiply_symbolic_fill(state, d_a, d_b, d_c); + std::mt19937 g(0); + for (int i = 0; i < 3; i++) { + // regenerate value of a and b; + std::uniform_real_distribution val_dist(0.0, 100.0); + for (auto& v : a_values) { + v = val_dist(g); + } + for (auto& v : b_values) { + v = val_dist(g); + } + // create different pointers than the symbolic phase, but they still + // hold the same sparsity + thrust::device_vector d_a_values_new(a_values); + thrust::device_vector d_a_colind_new(d_a_colind); + thrust::device_vector d_a_rowptr_new(d_a_rowptr); + thrust::device_vector d_b_values_new(b_values); + thrust::device_vector d_b_colind_new(d_b_colind); + thrust::device_vector d_b_rowptr_new(d_b_rowptr); + thrust::device_vector d_c_values_new(d_c_values); + thrust::device_vector d_c_colind_new(d_c_colind); + thrust::device_vector d_c_rowptr_new(d_c_rowptr); + spblas::csr_view d_a( + d_a_values_new.data().get(), d_a_rowptr_new.data().get(), + d_a_colind_new.data().get(), a_shape, a_nnz); + spblas::csr_view d_b( + d_b_values_new.data().get(), d_b_rowptr_new.data().get(), + d_b_colind_new.data().get(), b_shape, b_nnz); + spblas::csr_view d_c( + d_c_values_new.data().get(), d_c_rowptr_new.data().get(), + d_c_colind_new.data().get(), {m, n}, nnz); + // call numeric on new data + spblas::multiply_numeric(state, d_a, d_b, d_c); + // move c back to host memory + std::vector c_values(nnz); + std::vector c_rowptr(m + 1); + std::vector c_colind(nnz); + thrust::copy(d_c_values_new.begin(), d_c_values_new.end(), + c_values.begin()); + thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(), + c_rowptr.begin()); + thrust::copy(d_c_colind_new.begin(), d_c_colind_new.end(), + c_colind.begin()); + + spblas::csr_view c(c_values, c_rowptr, + c_colind, {m, n}, nnz); + + spblas::__backend::spa_accumulator c_row_ref( + spblas::__backend::shape(c)[1]); + + spblas::__backend::spa_accumulator c_row_acc( + spblas::__backend::shape(c)[1]); + + for (auto&& [i, a_row] : spblas::__backend::rows(a)) { + c_row_ref.clear(); + for (auto&& [k, a_v] : a_row) { + auto&& b_row = spblas::__backend::lookup_row(b, k); + + for (auto&& [j, b_v] : b_row) { + c_row_ref[j] += a_v * b_v; + } + } + + auto&& c_row = spblas::__backend::lookup_row(c, i); + + // Accumulate output into `c_row_acc` so that we can allow + // duplicate column indices. + c_row_acc.clear(); + for (auto&& [j, c_v] : c_row) { + c_row_acc[j] += c_v; + } + + for (auto&& [j, c_v] : c_row) { + EXPECT_EQ_(c_row_ref[j], c_row_acc[j]); + } + + EXPECT_EQ(c_row_ref.size(), c_row_acc.size()); + } + } + } + } +} From 4595c8d43ca74f2101066ab54a47f2bc8e0a9f9e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 8 May 2025 00:27:58 +0200 Subject: [PATCH 06/10] create a matrix descriptor helper --- .../spblas/vendor/rocsparse/descriptor.hpp | 33 +++++++++++++++++ include/spblas/vendor/rocsparse/multiply.hpp | 10 ++---- .../vendor/rocsparse/multiply_spgemm.hpp | 35 +++---------------- 3 files changed, 40 insertions(+), 38 deletions(-) create mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp new file mode 100644 index 0000000..e47e073 --- /dev/null +++ b/include/spblas/vendor/rocsparse/descriptor.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include + +#include + +#include +#include + +#include "exception.hpp" +#include "types.hpp" + +namespace spblas { +namespace __rocsparse { + +// create matrix descriptor from spblas csr view +template + requires __detail::is_csr_view_v +rocsparse_spmat_descr create_matrix_descr(mat&& a) { + using matrix_type = std::remove_cvref_t; + rocsparse_spmat_descr descr; + throw_if_error(rocsparse_create_csr_descr( + &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), + a.rowptr().data(), a.colind().data(), a.values().data(), + to_rocsparse_indextype(), + to_rocsparse_indextype(), + rocsparse_index_base_zero, + to_rocsparse_datatype())); + return descr; +} + +} // namespace __rocsparse +} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index 7d88c5c..212a5d1 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -10,6 +10,7 @@ #include #include +#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -59,14 +60,7 @@ class spmv_state_t { tensor_scalar_t alpha = alpha_optional.value_or(1); auto handle = this->handle_.get(); - rocsparse_spmat_descr mat; - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &mat, __backend::shape(a_base)[0], __backend::shape(a_base)[1], - a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), - a_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, to_rocsparse_datatype())); + rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base); rocsparse_dnvec_descr vecb; rocsparse_dnvec_descr vecc; __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 74e6f70..5eae63e 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -10,6 +10,7 @@ #include #include +#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -85,36 +86,10 @@ class spgemm_state_t { value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); // Create sparse matrix A in CSR format - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_a_, __backend::shape(a_base)[0], __backend::shape(a_base)[1], - a_base.values().size(), a_base.rowptr().data(), a_base.colind().data(), - a_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_b_, __backend::shape(b_base)[0], __backend::shape(b_base)[1], - b_base.values().size(), b_base.rowptr().data(), b_base.colind().data(), - b_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_c_, __backend::shape(a_base)[0], __backend::shape(b_base)[1], - 0, c.rowptr().data(), NULL, NULL, - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_csr_descr( - &this->mat_d_, __backend::shape(d_base)[0], __backend::shape(d_base)[1], - d_base.values().size(), d_base.rowptr().data(), d_base.colind().data(), - d_base.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); + this->mat_a_ = __rocsparse::create_matrix_descr(a_base); + this->mat_b_ = __rocsparse::create_matrix_descr(b_base); + this->mat_c_ = __rocsparse::create_matrix_descr(c); + this->mat_d_ = __rocsparse::create_matrix_descr(d_base); //-------------------------------------------------------------------------- // SpGEMM Computation // ask buffer_size bytes for external memory From eeb3a674fb995d3e7a41b919e4c7197c78ddc703 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 8 May 2025 00:36:35 +0200 Subject: [PATCH 07/10] add vector descriptor helper --- include/spblas/vendor/rocsparse/descriptor.hpp | 12 ++++++++++++ include/spblas/vendor/rocsparse/multiply.hpp | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp index e47e073..cf45934 100644 --- a/include/spblas/vendor/rocsparse/descriptor.hpp +++ b/include/spblas/vendor/rocsparse/descriptor.hpp @@ -29,5 +29,17 @@ rocsparse_spmat_descr create_matrix_descr(mat&& a) { return descr; } +// create dense vector from mdspan +template + requires __ranges::contiguous_range +rocsparse_dnvec_descr create_vector_descr(vec&& v) { + using vector_type = std::remove_cvref_t; + rocsparse_dnvec_descr descr; + throw_if_error(rocsparse_create_dnvec_descr( + &descr, v.size(), v.data(), + to_rocsparse_datatype())); + return descr; +} + } // namespace __rocsparse } // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply.hpp b/include/spblas/vendor/rocsparse/multiply.hpp index 212a5d1..d05ff98 100644 --- a/include/spblas/vendor/rocsparse/multiply.hpp +++ b/include/spblas/vendor/rocsparse/multiply.hpp @@ -61,14 +61,8 @@ class spmv_state_t { auto handle = this->handle_.get(); rocsparse_spmat_descr mat = __rocsparse::create_matrix_descr(a_base); - rocsparse_dnvec_descr vecb; - rocsparse_dnvec_descr vecc; - __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( - &vecb, b_base.size(), b_base.data(), - to_rocsparse_datatype())); - __rocsparse::throw_if_error(rocsparse_create_dnvec_descr( - &vecc, c.size(), c.data(), - to_rocsparse_datatype())); + rocsparse_dnvec_descr vecb = __rocsparse::create_vector_descr(b_base); + rocsparse_dnvec_descr vecc = __rocsparse::create_vector_descr(c); value_type alpha_val = alpha; value_type beta = 0.0; long unsigned int buffer_size = 0; From 59c0b7d0aac5e44ece6edd299a22f36b0a138605 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 22 May 2025 16:24:31 +0200 Subject: [PATCH 08/10] review update --- include/spblas/vendor/rocsparse/multiply_spgemm.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index 5eae63e..c6ede83 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -85,19 +86,19 @@ class spgemm_state_t { auto beta_optional = __detail::get_scaling_factor(d); value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); - // Create sparse matrix A in CSR format + // Create sparse matrix in CSR format this->mat_a_ = __rocsparse::create_matrix_descr(a_base); this->mat_b_ = __rocsparse::create_matrix_descr(b_base); this->mat_c_ = __rocsparse::create_matrix_descr(c); this->mat_d_ = __rocsparse::create_matrix_descr(d_base); - //-------------------------------------------------------------------------- - // SpGEMM Computation // ask buffer_size bytes for external memory __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, to_rocsparse_datatype(), rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); + // allocate the new buffer if it requires more than what the buffer + // currently has. if (buffer_size > this->buffer_size_) { this->alloc_.deallocate(workspace_, this->buffer_size_); this->buffer_size_ = buffer_size; @@ -113,6 +114,7 @@ class spgemm_state_t { int64_t c_num_cols; __rocsparse::throw_if_error(rocsparse_spmat_get_size( this->mat_c_, &c_num_rows, &c_num_cols, &this->result_nnz_)); + // form a shape this->result_shape_ = index(c_num_rows, c_num_cols); } @@ -218,10 +220,10 @@ class spgemm_state_t { std::function>; handle_manager handle_; rocsparse::hip_allocator alloc_; - long unsigned int buffer_size_; + std::uint64_t buffer_size_; char* workspace_; index result_shape_; - index_t result_nnz_; + std::int64_t result_nnz_; rocsparse_spmat_descr mat_a_; rocsparse_spmat_descr mat_b_; rocsparse_spmat_descr mat_c_; From 5f9c0905727f4efcad83e1153954e0ea0399ef3b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 7 Aug 2025 16:13:58 +0200 Subject: [PATCH 09/10] fix and adapt the changes from main --- .../spblas/vendor/rocsparse/descriptor.hpp | 45 ------------------- .../vendor/rocsparse/multiply_spgemm.hpp | 19 ++++---- .../device/rocsparse/spgemm_4args_test.cpp | 8 ++-- test/gtest/device/spgemm_reuse_test.cpp | 8 ++-- test/gtest/device/spgemm_test.cpp | 6 +-- 5 files changed, 20 insertions(+), 66 deletions(-) delete mode 100644 include/spblas/vendor/rocsparse/descriptor.hpp diff --git a/include/spblas/vendor/rocsparse/descriptor.hpp b/include/spblas/vendor/rocsparse/descriptor.hpp deleted file mode 100644 index cf45934..0000000 --- a/include/spblas/vendor/rocsparse/descriptor.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include - -#include "exception.hpp" -#include "types.hpp" - -namespace spblas { -namespace __rocsparse { - -// create matrix descriptor from spblas csr view -template - requires __detail::is_csr_view_v -rocsparse_spmat_descr create_matrix_descr(mat&& a) { - using matrix_type = std::remove_cvref_t; - rocsparse_spmat_descr descr; - throw_if_error(rocsparse_create_csr_descr( - &descr, __backend::shape(a)[0], __backend::shape(a)[1], a.values().size(), - a.rowptr().data(), a.colind().data(), a.values().data(), - to_rocsparse_indextype(), - to_rocsparse_indextype(), - rocsparse_index_base_zero, - to_rocsparse_datatype())); - return descr; -} - -// create dense vector from mdspan -template - requires __ranges::contiguous_range -rocsparse_dnvec_descr create_vector_descr(vec&& v) { - using vector_type = std::remove_cvref_t; - rocsparse_dnvec_descr descr; - throw_if_error(rocsparse_create_dnvec_descr( - &descr, v.size(), v.data(), - to_rocsparse_datatype())); - return descr; -} - -} // namespace __rocsparse -} // namespace spblas diff --git a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp index c6ede83..0b58f37 100644 --- a/include/spblas/vendor/rocsparse/multiply_spgemm.hpp +++ b/include/spblas/vendor/rocsparse/multiply_spgemm.hpp @@ -11,7 +11,6 @@ #include #include -#include "descriptor.hpp" #include "exception.hpp" #include "hip_allocator.hpp" #include "types.hpp" @@ -87,15 +86,15 @@ class spgemm_state_t { value_type beta = beta_optional.value_or(1); auto handle = this->handle_.get(); // Create sparse matrix in CSR format - this->mat_a_ = __rocsparse::create_matrix_descr(a_base); - this->mat_b_ = __rocsparse::create_matrix_descr(b_base); - this->mat_c_ = __rocsparse::create_matrix_descr(c); - this->mat_d_ = __rocsparse::create_matrix_descr(d_base); + this->mat_a_ = __rocsparse::create_rocsparse_handle(a_base); + this->mat_b_ = __rocsparse::create_rocsparse_handle(b_base); + this->mat_c_ = __rocsparse::create_rocsparse_handle(c); + this->mat_d_ = __rocsparse::create_rocsparse_handle(d_base); // ask buffer_size bytes for external memory __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_buffer_size, &buffer_size, nullptr)); // allocate the new buffer if it requires more than what the buffer // currently has. @@ -107,7 +106,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( handle, rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_nnz, &this->buffer_size_, this->workspace_)); // get matrix C non-zero entries and size int64_t c_num_rows; @@ -141,7 +140,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_compute, &this->buffer_size_, workspace_)); } @@ -168,7 +167,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_symbolic, &this->buffer_size_, this->workspace_)); } @@ -210,7 +209,7 @@ class spgemm_state_t { __rocsparse::throw_if_error(rocsparse_spgemm( this->handle_.get(), rocsparse_operation_none, rocsparse_operation_none, &alpha, this->mat_a_, this->mat_b_, &beta, this->mat_d_, this->mat_c_, - to_rocsparse_datatype(), rocsparse_spgemm_alg_default, + detail::rocsparse_data_type_v, rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_numeric, &this->buffer_size_, this->workspace_)); } diff --git a/test/gtest/device/rocsparse/spgemm_4args_test.cpp b/test/gtest/device/rocsparse/spgemm_4args_test.cpp index f1b272b..1a8e75a 100644 --- a/test/gtest/device/rocsparse/spgemm_4args_test.cpp +++ b/test/gtest/device/rocsparse/spgemm_4args_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMM_4Args) { +TEST(thrust_CsrView, SpGEMM_4Args) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -111,7 +111,7 @@ TEST(CsrView, SpGEMM_4Args) { } } -TEST(CsrView, SpGEMM_4Args_AScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -214,7 +214,7 @@ TEST(CsrView, SpGEMM_4Args_AScaled) { } } -TEST(CsrView, SpGEMM_4Args_BScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -317,7 +317,7 @@ TEST(CsrView, SpGEMM_4Args_BScaled) { } } -TEST(CsrView, SpGEMM_4Args_DScaled) { +TEST(thrust_CsrView, SpGEMM_4Args_DScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp index 836190e..b798d6e 100644 --- a/test/gtest/device/spgemm_reuse_test.cpp +++ b/test/gtest/device/spgemm_reuse_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMMReuse) { +TEST(thrust_CsrView, SpGEMMReuse) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -113,7 +113,7 @@ TEST(CsrView, SpGEMMReuse) { } } -TEST(CsrView, SpGEMMReuse_AScaled) { +TEST(thrust_CsrView, SpGEMMReuse_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -218,7 +218,7 @@ TEST(CsrView, SpGEMMReuse_AScaled) { } } -TEST(CsrView, SpGEMMReuse_BScaled) { +TEST(thrust_CsrView, SpGEMMReuse_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -323,7 +323,7 @@ TEST(CsrView, SpGEMMReuse_BScaled) { } } -TEST(CsrView, SpGEMMReuseAndChangePointer) { +TEST(thrust_CsrView, SpGEMMReuseAndChangePointer) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = diff --git a/test/gtest/device/spgemm_test.cpp b/test/gtest/device/spgemm_test.cpp index 4662ffd..98e1aed 100644 --- a/test/gtest/device/spgemm_test.cpp +++ b/test/gtest/device/spgemm_test.cpp @@ -9,7 +9,7 @@ using value_t = float; using index_t = spblas::index_t; using offset_t = spblas::offset_t; -TEST(CsrView, SpGEMM) { +TEST(thrust_CsrView, SpGEMM) { for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { auto [a_values, a_rowptr, a_colind, a_shape, a_nnz] = @@ -96,7 +96,7 @@ TEST(CsrView, SpGEMM) { } } -TEST(CsrView, SpGEMM_AScaled) { +TEST(thrust_CsrView, SpGEMM_AScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { @@ -184,7 +184,7 @@ TEST(CsrView, SpGEMM_AScaled) { } } -TEST(CsrView, SpGEMM_BScaled) { +TEST(thrust_CsrView, SpGEMM_BScaled) { value_t alpha = 2.0f; for (auto&& [m, k, nnz] : util::dims) { for (auto&& n : {m, k}) { From e36a51616e67c1fcb52f26d2987bc5ac50e80510 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 20 Aug 2025 16:08:22 +0200 Subject: [PATCH 10/10] fix meaningless cmake function Co-authored-by: Benjamin Brock --- test/gtest/CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 84248cf..cd96de1 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -13,16 +13,12 @@ if (SPBLAS_CPU_BACKEND) transpose_test.cpp triangular_solve_test.cpp) endif() -function(add_rocsparse_lang file_list) - if (ENABLE_ROCSPARSE) - set_source_files_properties(${${file_list}} PROPERTIES LANGUAGE HIP) - endif() -endfunction() + # GPU tests if (SPBLAS_GPU_BACKEND) if (ENABLE_ROCSPARSE) set(GPUTEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp) - add_rocsparse_lang(GPUTEST_SOURCES) + set_source_files_properties(${GPUTEST_SOURCES} PROPERTIES LANGUAGE HIP) else () set(GPUTEST_SOURCES device/spmv_test.cpp) endif ()