From 03e791e1e1b60e2d1373b21d0236fe1259d0625e Mon Sep 17 00:00:00 2001 From: ThreeMonth03 Date: Fri, 26 Sep 2025 18:33:36 +0800 Subject: [PATCH] Accelerate the mean operations. --- cpp/modmesh/buffer/SimpleArray.hpp | 307 +++++++++++++++++++++++++---- profiling/profile_statistic_ops.py | 113 ++++++++++- tests/test_buffer.py | 6 + 3 files changed, 376 insertions(+), 50 deletions(-) diff --git a/cpp/modmesh/buffer/SimpleArray.hpp b/cpp/modmesh/buffer/SimpleArray.hpp index 865b4abf1..6950ae408 100644 --- a/cpp/modmesh/buffer/SimpleArray.hpp +++ b/cpp/modmesh/buffer/SimpleArray.hpp @@ -42,6 +42,8 @@ #include #include #include +#include +#include #if defined(_MSC_VER) #include @@ -141,6 +143,250 @@ struct select_real_t> using type = U; }; +template +class SimpleArrayMixinSum +{ + +private: + + using internal_types = detail::SimpleArrayInternalTypes; + +public: + + using value_type = typename internal_types::value_type; + + value_type sum() const + { + auto athis = static_cast(this); + if (athis->is_c_contiguous()) + { + return sum_contiguous(); + } + else + { + return sum_non_contiguous(); + } + } + + value_type sum_contiguous() const + { + auto athis = static_cast(this); + value_type result; + if constexpr (is_complex_v) + { + result = value_type{}; + } + else + { + result = 0; + } + sum_unrolled_generic(athis->data(), athis->size(), 1, result); + return result; + } + +private: + value_type sum_non_contiguous() const + { + auto athis = static_cast(this); + const size_t ndim = athis->ndim(); + const auto & shape = athis->shape(); + const auto & stride = athis->stride(); + + // Calculate the size of the last dimension for loop unrolling + const size_t last_dim_size = shape[ndim - 1]; + const size_t last_stride = stride[ndim - 1]; + + // Calculate total prefix combinations to decide on parallelization + const size_t total_combinations = calculate_total_combinations(shape, ndim - 1); + + // Use parallel processing for large arrays + if (total_combinations > 10000) + { + return sum_non_contiguous_parallel(athis, shape, stride, last_dim_size, last_stride); + } + else + { + return sum_non_contiguous_sequential(athis, shape, stride, last_dim_size, last_stride); + } + } + + value_type sum_non_contiguous_sequential(A const * athis, const small_vector & shape, const small_vector & stride, size_t last_dim_size, size_t last_stride) const + { + value_type result; + if constexpr (is_complex_v) + { + result = value_type{}; + } + else + { + result = 0; + } + + const size_t ndim = shape.size(); + small_vector prefix_idx(ndim - 1, 0); + + do + { + size_t base_offset = 0; + for (size_t i = 0; i < ndim - 1; ++i) + { + base_offset += prefix_idx[i] * stride[i]; + } + + const value_type * data_ptr = athis->data() + base_offset; + sum_unrolled_generic(data_ptr, last_dim_size, last_stride, result); + + } while (next_cartesian_product_prefix(prefix_idx, shape, ndim - 1)); + + return result; + } + + value_type sum_non_contiguous_parallel(A const * athis, const small_vector & shape, const small_vector & stride, size_t last_dim_size, size_t last_stride) const + { + const size_t ndim = shape.size(); + const size_t prefix_len = ndim - 1; + const size_t total_combinations = calculate_total_combinations(shape, prefix_len); + + const size_t num_threads = static_cast(std::thread::hardware_concurrency()); + const size_t combinations_per_thread = (total_combinations + num_threads - 1) / num_threads; + + small_vector thread_results(num_threads); + std::vector threads; + + for (size_t t = 0; t < num_threads; ++t) + { + const size_t start_idx = t * combinations_per_thread; + const size_t end_idx = std::min(start_idx + combinations_per_thread, total_combinations); + + if (start_idx < total_combinations) + { + threads.emplace_back([this, athis, &shape, &stride, last_dim_size, last_stride, start_idx, end_idx, &thread_results, t, prefix_len]() + { + value_type local_result; + if constexpr (is_complex_v) + { + local_result = value_type{}; + } + else + { + local_result = 0; + } + + for (size_t combo_idx = start_idx; combo_idx < end_idx; ++combo_idx) + { + small_vector prefix_idx(prefix_len); + size_t temp_idx = combo_idx; + for (size_t i = 0; i < prefix_len; ++i) + { + prefix_idx[i] = temp_idx % shape[i]; + temp_idx /= shape[i]; + } + + size_t base_offset = 0; + for (size_t i = 0; i < prefix_len; ++i) + { + base_offset += prefix_idx[i] * stride[i]; + } + + const value_type* data_ptr = athis->data() + base_offset; + sum_unrolled_generic(data_ptr, last_dim_size, last_stride, local_result); + } + + thread_results[t] = local_result; }); + } + } + + for (auto & thread : threads) + { + thread.join(); + } + + value_type result; + if constexpr (is_complex_v) + { + result = value_type{}; + } + else + { + result = 0; + } + + for (const auto & thread_result : thread_results) + { + result += thread_result; + } + + return result; + } + + size_t calculate_total_combinations(const small_vector & shape, size_t prefix_len) const + { + size_t total = 1; + for (size_t i = 0; i < prefix_len; ++i) + { + total *= shape[i]; + } + return total; + } + + void sum_unrolled_generic(const value_type * data_ptr, size_t size, size_t stride, value_type & result) const + { + if constexpr (!std::is_same_v>) + { + const int unroll = (sizeof(value_type) <= 1 ? 16 : sizeof(value_type) <= 2 ? 8 + : sizeof(value_type) <= 4 ? 4 + : 2); + + size_t i = 0; + for (; i + unroll <= size; i += unroll) + { + for (int u = 0; u < unroll; ++u) + { + result += data_ptr[(i + u) * stride]; + } + } + + for (; i < size; ++i) + { + result += data_ptr[i * stride]; + } + } + else + { + const int unroll = 8; + size_t i = 0; + + for (; i + unroll <= size; i += unroll) + { + for (int u = 0; u < unroll; ++u) + { + result |= data_ptr[(i + u) * stride]; + } + } + + for (; i < size; ++i) + { + result |= data_ptr[i * stride]; + } + } + } + + bool next_cartesian_product_prefix(small_vector & idx, + const small_vector & shape, + size_t prefix_len) const + { + for (size_t i = prefix_len; i > 0; --i) + { + if (++idx[i - 1] < shape[i - 1]) + { + return true; + } + idx[i - 1] = 0; + } + return false; + } +}; /* end class SimpleArrayMixinSum */ + template class SimpleArrayMixinCalculators { @@ -342,14 +588,8 @@ class SimpleArrayMixinCalculators value_type mean() const { auto athis = static_cast(this); - auto sidx = athis->first_sidx(); - value_type sum = 0; - int64_t total = 0; - do - { - sum += athis->at(sidx); - ++total; - } while (athis->next_sidx(sidx)); + int64_t total = athis->size(); + value_type sum = athis->sum(); return sum / static_cast(total); } @@ -464,36 +704,6 @@ class SimpleArrayMixinCalculators return initial; } - value_type sum() const - { - value_type initial; - if constexpr (is_complex_v) - { - initial = value_type(); - } - else - { - initial = 0; - } - - auto athis = static_cast(this); - if constexpr (!std::is_same_v>) - { - for (size_t i = 0; i < athis->size(); ++i) - { - initial += athis->data(i); - } - } - else - { - for (size_t i = 0; i < athis->size(); ++i) - { - initial |= athis->data(i); - } - } - return initial; - } - A abs() const { auto athis = static_cast(this); @@ -1014,6 +1224,7 @@ class SimpleArrayMixinSearch template class SimpleArray : public detail::SimpleArrayMixinModifiers, T> + , public detail::SimpleArrayMixinSum, T> , public detail::SimpleArrayMixinCalculators, T> , public detail::SimpleArrayMixinSort, T> , public detail::SimpleArrayMixinSearch, T> @@ -1464,21 +1675,33 @@ class SimpleArray value_type const * body() const { return m_body; } value_type * body() { return m_body; } + bool is_c_contiguous() const { return is_c_contiguous(m_shape, m_stride); } + private: - void check_c_contiguous(small_vector const & shape, - small_vector const & stride) const + bool is_c_contiguous(small_vector const & shape, + small_vector const & stride) const { if (stride[stride.size() - 1] != 1) { - throw std::runtime_error("SimpleArray: C contiguous stride must end with 1"); + return false; } for (size_t it = 0; it < shape.size() - 1; ++it) { if (stride[it] != shape[it + 1] * stride[it + 1]) { - throw std::runtime_error("SimpleArray: C contiguous stride must match shape"); + return false; } } + return true; + } + + void check_c_contiguous(small_vector const & shape, + small_vector const & stride) const + { + if (!is_c_contiguous(shape, stride)) + { + throw std::runtime_error("SimpleArray: C contiguous stride must match shape and end with 1"); + } } void check_f_contiguous(small_vector const & shape, diff --git a/profiling/profile_statistic_ops.py b/profiling/profile_statistic_ops.py index c0bb8aac5..c24596275 100644 --- a/profiling/profile_statistic_ops.py +++ b/profiling/profile_statistic_ops.py @@ -109,9 +109,10 @@ def profile_average_np(src): def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10, - axis=None): + axis=None, dims=None): axis_str = f" (axis={axis})" if axis is not None else "" - print(f"\n# {op} (dtype={dtype}){axis_str}") + dims_str = f" ({dims}D)" if dims is not None else "" + print(f"\n# {op} (dtype={dtype}){axis_str}{dims_str}") results = [] @@ -130,9 +131,29 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10, else: src = np.random.randint(-1000, 1000, shape, dtype=dtype) src_sa = make_container(src, dtype) + elif dims == 3: + cube_size = int(N ** (1/3)) + if cube_size < 2: + cube_size = 2 + larger_shape = (cube_size * 3, cube_size * 3, cube_size * 3) + if np.issubdtype(dtype, np.floating): + src = np.random.rand(*larger_shape).astype(dtype, copy=False) + else: + if dtype == np.int8: + src = np.random.randint( + -100, 100, larger_shape, dtype=dtype) + elif dtype == np.int32: + src = np.random.randint( + -1000, 1000, larger_shape, dtype=dtype) + else: + src = np.random.randint( + -1000, 1000, larger_shape, dtype=dtype) + src = src[::3, ::3, ::3] + shape = src.shape + src_sa = make_container(src, dtype) else: if np.issubdtype(dtype, np.floating): - src = np.random.rand(N).astype(dtype, copy=False) + src = np.random.rand(N * 5).astype(dtype, copy=False) else: if dtype == np.int8: src = np.random.randint(-100, 100, N, dtype=dtype) @@ -142,7 +163,7 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10, src = np.random.randint(-1000, 1000, N, dtype=dtype) src_sa = make_container(src, dtype) modmesh.call_profiler.reset() - for _ in range(it): + for i in range(it): if axis is not None: prof_func_np(src, axis=axis) prof_func_sa(src_sa, axis=axis) @@ -159,14 +180,22 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10, result_row = { 'operation': op, 'dtype': str(dtype), - 'size': N if axis is None else f"{shape[0]}x{shape[1]}", + 'size': N if axis is None and dims is None + else f"{shape[0]}x{shape[1]}" if axis is not None + else f"{shape[0]}x{shape[1]}x{shape[2]}" if dims == 3 else N, 'size_numeric': N, - 'axis': axis + 'axis': axis, + 'dims': dims } result_row.update(out) results.append(result_row) - print(f"## N = {N if axis is None else shape}") + if axis is not None: + print(f"## N = {shape}") + elif dims == 3: + print(f"## N = {shape}") + else: + print(f"## N = {N}") def print_row(*cols): print(str.format("| {:10s} | {:15s} | {:15s} |", *(cols[0:3]))) @@ -242,6 +271,7 @@ def create_performance_plots(all_results): for op in operations: create_1d_performance_plot(all_results, op, dtypes) create_axis_performance_plot(all_results, op, dtypes) + create_3d_performance_plot(all_results, op, dtypes) def create_1d_performance_plot(all_results, op, dtypes): @@ -255,7 +285,9 @@ def create_1d_performance_plot(all_results, op, dtypes): axes = axes.reshape(1, -1) op_data = [r for r in all_results - if r['operation'] == op and r['axis'] is None] + if r['operation'] == op + and r['axis'] is None + and r.get('dims') is None] for i, dtype in enumerate(sorted_dtypes): dtype_data = [r for r in op_data if r['dtype'] == dtype] @@ -366,12 +398,63 @@ def create_axis_performance_plot(all_results, op, dtypes): plt.close() +def create_3d_performance_plot(all_results, op, dtypes): + sorted_dtypes = sorted(dtypes) + n_dtypes = len(sorted_dtypes) + + fig, axes = plt.subplots(n_dtypes, 2, figsize=(15, 4*n_dtypes)) + fig.suptitle(f'3D Performance: {op.title()}', fontsize=16) + + if n_dtypes == 1: + axes = axes.reshape(1, -1) + + op_data = [r for r in all_results + if r['operation'] == op and r['dims'] == 3] + + for i, dtype in enumerate(sorted_dtypes): + dtype_data = [r for r in op_data if r['dtype'] == dtype] + + sizes = [r['size_numeric'] for r in dtype_data] + np_times = [r['np'] for r in dtype_data] + sa_times = [r['sa'] for r in dtype_data] + + ax1 = axes[i, 0] + ax1.plot(sizes, np_times, 'o-', label='NumPy', color='blue') + ax1.plot(sizes, sa_times, 's-', label='SimpleArray', color='red') + ax1.set_xlabel('Array Size') + ax1.set_ylabel('Time per Call (ms)') + ax1.set_title(f'{dtype} - 3D Performance') + ax1.set_xscale('log') + ax1.set_yscale('log') + ax1.legend() + ax1.grid(True, alpha=0.3) + + ax2 = axes[i, 1] + speedups = [r['np'] / r['sa'] for r in dtype_data] + ax2.plot(sizes, speedups, 'D-', color='green') + ax2.set_xlabel('Array Size') + ax2.set_ylabel('Speedup (NumPy / SimpleArray)') + ax2.set_title(f'{dtype} - 3D Speedup') + ax2.set_xscale('log') + ax2.axhline(y=1, color='black', linestyle='--', alpha=0.5) + ax2.grid(True, alpha=0.3) + + plt.tight_layout() + plt.savefig('profiling/results/png/' + + 'performance_3d_non_contiguous_' + + f'{op}.png', + dpi=300, bbox_inches='tight') + plt.close() + + def main(): + print("Starting performance profiling...") sizes = [10, 10**3, 10**6] dtypes = [np.float64, np.float32, np.int64, np.int32, np.int8] all_results = [] for dtype in dtypes: + print(f"Testing dtype: {dtype}") operations_1d = [ ("median", profile_median_np, profile_median_sa), ("mean", profile_mean_np, profile_mean_sa), @@ -399,6 +482,20 @@ def main(): op, prof_func_np, prof_func_sa, dtype, sizes, axis=axis) all_results.extend(results) + # 3D without axis tests + operations_3d = [ + ("median", profile_median_np, profile_median_sa), + ("mean", profile_mean_np, profile_mean_sa), + ("var", profile_var_np, profile_var_sa), + ("std", profile_std_np, profile_std_sa), + ("average", profile_average_np, profile_average_sa) + ] + + for op, prof_func_np, prof_func_sa in operations_3d: + results = profile_stat_op( + op, prof_func_np, prof_func_sa, dtype, sizes, dims=3) + all_results.extend(results) + create_performance_plots(all_results) diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 38165d03c..7d414e998 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -986,6 +986,12 @@ def test_minmaxsum(self): self.assertEqual(sarr.min(), -2.3) self.assertEqual(sarr.max(), 9.2) + def test_sum_non_contiguous(self): + nparr = np.arange(625, dtype='float64').reshape((5, 5, 5, 5)) + nparr = nparr[:3:2, 1:4:2, :3:3, 3:4:2] + sarr = modmesh.SimpleArrayFloat64(array=nparr) + self.assertEqual(sarr.sum(), np.sum(nparr)) + def test_abs(self): sarr = modmesh.SimpleArrayInt64(shape=(3, 2), value=-2) self.assertEqual(sarr.sum(), -2 * 3 * 2)