From 03e791e1e1b60e2d1373b21d0236fe1259d0625e Mon Sep 17 00:00:00 2001
From: ThreeMonth03 <austin20463@gmail.com>
Date: Fri, 26 Sep 2025 18:33:36 +0800
Subject: [PATCH] Accelerate the mean operations.

---
 cpp/modmesh/buffer/SimpleArray.hpp | 307 +++++++++++++++++++++++++----
 profiling/profile_statistic_ops.py | 113 ++++++++++-
 tests/test_buffer.py               |   6 +
 3 files changed, 376 insertions(+), 50 deletions(-)
diff --git a/cpp/modmesh/buffer/SimpleArray.hpp b/cpp/modmesh/buffer/SimpleArray.hpp
index 865b4abf1..6950ae408 100644
--- a/cpp/modmesh/buffer/SimpleArray.hpp
+++ b/cpp/modmesh/buffer/SimpleArray.hpp
@@ -42,6 +42,8 @@
 #include <functional>
 #include <numeric>
 #include <algorithm>
+#include <thread>
+#include <vector>
 
 #if defined(_MSC_VER)
 #include <BaseTsd.h>
@@ -141,6 +143,250 @@ struct select_real_t<Complex<U>>
     using type = U;
 };
 
+template <typename A, typename T>
+class SimpleArrayMixinSum
+{
+
+private:
+
+    using internal_types = detail::SimpleArrayInternalTypes<T>;
+
+public:
+
+    using value_type = typename internal_types::value_type;
+
+    value_type sum() const
+    {
+        auto athis = static_cast<A const *>(this);
+        if (athis->is_c_contiguous())
+        {
+            return sum_contiguous();
+        }
+        else
+        {
+            return sum_non_contiguous();
+        }
+    }
+
+    value_type sum_contiguous() const
+    {
+        auto athis = static_cast<A const *>(this);
+        value_type result;
+        if constexpr (is_complex_v<value_type>)
+        {
+            result = value_type{};
+        }
+        else
+        {
+            result = 0;
+        }
+        sum_unrolled_generic(athis->data(), athis->size(), 1, result);
+        return result;
+    }
+
+private:
+    value_type sum_non_contiguous() const
+    {
+        auto athis = static_cast<A const *>(this);
+        const size_t ndim = athis->ndim();
+        const auto & shape = athis->shape();
+        const auto & stride = athis->stride();
+
+        // Calculate the size of the last dimension for loop unrolling
+        const size_t last_dim_size = shape[ndim - 1];
+        const size_t last_stride = stride[ndim - 1];
+
+        // Calculate total prefix combinations to decide on parallelization
+        const size_t total_combinations = calculate_total_combinations(shape, ndim - 1);
+
+        // Use parallel processing for large arrays
+        if (total_combinations > 10000)
+        {
+            return sum_non_contiguous_parallel(athis, shape, stride, last_dim_size, last_stride);
+        }
+        else
+        {
+            return sum_non_contiguous_sequential(athis, shape, stride, last_dim_size, last_stride);
+        }
+    }
+
+    value_type sum_non_contiguous_sequential(A const * athis, const small_vector<size_t> & shape, const small_vector<size_t> & stride, size_t last_dim_size, size_t last_stride) const
+    {
+        value_type result;
+        if constexpr (is_complex_v<value_type>)
+        {
+            result = value_type{};
+        }
+        else
+        {
+            result = 0;
+        }
+
+        const size_t ndim = shape.size();
+        small_vector<size_t> prefix_idx(ndim - 1, 0);
+
+        do
+        {
+            size_t base_offset = 0;
+            for (size_t i = 0; i < ndim - 1; ++i)
+            {
+                base_offset += prefix_idx[i] * stride[i];
+            }
+
+            const value_type * data_ptr = athis->data() + base_offset;
+            sum_unrolled_generic(data_ptr, last_dim_size, last_stride, result);
+
+        } while (next_cartesian_product_prefix(prefix_idx, shape, ndim - 1));
+
+        return result;
+    }
+
+    value_type sum_non_contiguous_parallel(A const * athis, const small_vector<size_t> & shape, const small_vector<size_t> & stride, size_t last_dim_size, size_t last_stride) const
+    {
+        const size_t ndim = shape.size();
+        const size_t prefix_len = ndim - 1;
+        const size_t total_combinations = calculate_total_combinations(shape, prefix_len);
+
+        const size_t num_threads = static_cast<size_t>(std::thread::hardware_concurrency());
+        const size_t combinations_per_thread = (total_combinations + num_threads - 1) / num_threads;
+
+        small_vector<value_type> thread_results(num_threads);
+        std::vector<std::thread> threads;
+
+        for (size_t t = 0; t < num_threads; ++t)
+        {
+            const size_t start_idx = t * combinations_per_thread;
+            const size_t end_idx = std::min(start_idx + combinations_per_thread, total_combinations);
+
+            if (start_idx < total_combinations)
+            {
+                threads.emplace_back([this, athis, &shape, &stride, last_dim_size, last_stride, start_idx, end_idx, &thread_results, t, prefix_len]()
+                                     {
+                    value_type local_result;
+                    if constexpr (is_complex_v<value_type>)
+                    {
+                        local_result = value_type{};
+                    }
+                    else
+                    {
+                        local_result = 0;
+                    }
+                    
+                    for (size_t combo_idx = start_idx; combo_idx < end_idx; ++combo_idx)
+                    {
+                        small_vector<size_t> prefix_idx(prefix_len);
+                        size_t temp_idx = combo_idx;
+                        for (size_t i = 0; i < prefix_len; ++i)
+                        {
+                            prefix_idx[i] = temp_idx % shape[i];
+                            temp_idx /= shape[i];
+                        }
+                        
+                        size_t base_offset = 0;
+                        for (size_t i = 0; i < prefix_len; ++i)
+                        {
+                            base_offset += prefix_idx[i] * stride[i];
+                        }
+                        
+                        const value_type* data_ptr = athis->data() + base_offset;
+                        sum_unrolled_generic(data_ptr, last_dim_size, last_stride, local_result);
+                    }
+                    
+                    thread_results[t] = local_result; });
+            }
+        }
+
+        for (auto & thread : threads)
+        {
+            thread.join();
+        }
+
+        value_type result;
+        if constexpr (is_complex_v<value_type>)
+        {
+            result = value_type{};
+        }
+        else
+        {
+            result = 0;
+        }
+
+        for (const auto & thread_result : thread_results)
+        {
+            result += thread_result;
+        }
+
+        return result;
+    }
+
+    size_t calculate_total_combinations(const small_vector<size_t> & shape, size_t prefix_len) const
+    {
+        size_t total = 1;
+        for (size_t i = 0; i < prefix_len; ++i)
+        {
+            total *= shape[i];
+        }
+        return total;
+    }
+
+    void sum_unrolled_generic(const value_type * data_ptr, size_t size, size_t stride, value_type & result) const
+    {
+        if constexpr (!std::is_same_v<bool, std::remove_const_t<value_type>>)
+        {
+            const int unroll = (sizeof(value_type) <= 1 ? 16 : sizeof(value_type) <= 2 ? 8
+                                                           : sizeof(value_type) <= 4   ? 4
+                                                                                       : 2);
+
+            size_t i = 0;
+            for (; i + unroll <= size; i += unroll)
+            {
+                for (int u = 0; u < unroll; ++u)
+                {
+                    result += data_ptr[(i + u) * stride];
+                }
+            }
+
+            for (; i < size; ++i)
+            {
+                result += data_ptr[i * stride];
+            }
+        }
+        else
+        {
+            const int unroll = 8;
+            size_t i = 0;
+
+            for (; i + unroll <= size; i += unroll)
+            {
+                for (int u = 0; u < unroll; ++u)
+                {
+                    result |= data_ptr[(i + u) * stride];
+                }
+            }
+
+            for (; i < size; ++i)
+            {
+                result |= data_ptr[i * stride];
+            }
+        }
+    }
+
+    bool next_cartesian_product_prefix(small_vector<size_t> & idx,
+                                       const small_vector<size_t> & shape,
+                                       size_t prefix_len) const
+    {
+        for (size_t i = prefix_len; i > 0; --i)
+        {
+            if (++idx[i - 1] < shape[i - 1])
+            {
+                return true;
+            }
+            idx[i - 1] = 0;
+        }
+        return false;
+    }
+}; /* end class SimpleArrayMixinSum */
+
 template <typename A, typename T>
 class SimpleArrayMixinCalculators
 {
@@ -342,14 +588,8 @@ class SimpleArrayMixinCalculators
     value_type mean() const
     {
         auto athis = static_cast<A const *>(this);
-        auto sidx = athis->first_sidx();
-        value_type sum = 0;
-        int64_t total = 0;
-        do
-        {
-            sum += athis->at(sidx);
-            ++total;
-        } while (athis->next_sidx(sidx));
+        int64_t total = athis->size();
+        value_type sum = athis->sum();
         return sum / static_cast<value_type>(total);
     }
 
@@ -464,36 +704,6 @@ class SimpleArrayMixinCalculators
         return initial;
     }
 
-    value_type sum() const
-    {
-        value_type initial;
-        if constexpr (is_complex_v<value_type>)
-        {
-            initial = value_type();
-        }
-        else
-        {
-            initial = 0;
-        }
-
-        auto athis = static_cast<A const *>(this);
-        if constexpr (!std::is_same_v<bool, std::remove_const_t<value_type>>)
-        {
-            for (size_t i = 0; i < athis->size(); ++i)
-            {
-                initial += athis->data(i);
-            }
-        }
-        else
-        {
-            for (size_t i = 0; i < athis->size(); ++i)
-            {
-                initial |= athis->data(i);
-            }
-        }
-        return initial;
-    }
-
     A abs() const
     {
         auto athis = static_cast<A const *>(this);
@@ -1014,6 +1224,7 @@ class SimpleArrayMixinSearch
 template <typename T>
 class SimpleArray
     : public detail::SimpleArrayMixinModifiers<SimpleArray<T>, T>
+    , public detail::SimpleArrayMixinSum<SimpleArray<T>, T>
     , public detail::SimpleArrayMixinCalculators<SimpleArray<T>, T>
     , public detail::SimpleArrayMixinSort<SimpleArray<T>, T>
     , public detail::SimpleArrayMixinSearch<SimpleArray<T>, T>
@@ -1464,21 +1675,33 @@ class SimpleArray
     value_type const * body() const { return m_body; }
     value_type * body() { return m_body; }
 
+    bool is_c_contiguous() const { return is_c_contiguous(m_shape, m_stride); }
+
 private:
-    void check_c_contiguous(small_vector<size_t> const & shape,
-                            small_vector<size_t> const & stride) const
+    bool is_c_contiguous(small_vector<size_t> const & shape,
+                         small_vector<size_t> const & stride) const
     {
         if (stride[stride.size() - 1] != 1)
         {
-            throw std::runtime_error("SimpleArray: C contiguous stride must end with 1");
+            return false;
         }
         for (size_t it = 0; it < shape.size() - 1; ++it)
         {
             if (stride[it] != shape[it + 1] * stride[it + 1])
             {
-                throw std::runtime_error("SimpleArray: C contiguous stride must match shape");
+                return false;
             }
         }
+        return true;
+    }
+
+    void check_c_contiguous(small_vector<size_t> const & shape,
+                            small_vector<size_t> const & stride) const
+    {
+        if (!is_c_contiguous(shape, stride))
+        {
+            throw std::runtime_error("SimpleArray: C contiguous stride must match shape and end with 1");
+        }
     }
 
     void check_f_contiguous(small_vector<size_t> const & shape,
diff --git a/profiling/profile_statistic_ops.py b/profiling/profile_statistic_ops.py
index c0bb8aac5..c24596275 100644
--- a/profiling/profile_statistic_ops.py
+++ b/profiling/profile_statistic_ops.py
@@ -109,9 +109,10 @@ def profile_average_np(src):
 
 
 def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10,
-                    axis=None):
+                    axis=None, dims=None):
     axis_str = f" (axis={axis})" if axis is not None else ""
-    print(f"\n# {op} (dtype={dtype}){axis_str}")
+    dims_str = f" ({dims}D)" if dims is not None else ""
+    print(f"\n# {op} (dtype={dtype}){axis_str}{dims_str}")
 
     results = []
 
@@ -130,9 +131,29 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10,
                 else:
                     src = np.random.randint(-1000, 1000, shape, dtype=dtype)
             src_sa = make_container(src, dtype)
+        elif dims == 3:
+            cube_size = int(N ** (1/3))
+            if cube_size < 2:
+                cube_size = 2
+            larger_shape = (cube_size * 3, cube_size * 3, cube_size * 3)
+            if np.issubdtype(dtype, np.floating):
+                src = np.random.rand(*larger_shape).astype(dtype, copy=False)
+            else:
+                if dtype == np.int8:
+                    src = np.random.randint(
+                        -100, 100, larger_shape, dtype=dtype)
+                elif dtype == np.int32:
+                    src = np.random.randint(
+                        -1000, 1000, larger_shape, dtype=dtype)
+                else:
+                    src = np.random.randint(
+                        -1000, 1000, larger_shape, dtype=dtype)
+            src = src[::3, ::3, ::3]
+            shape = src.shape
+            src_sa = make_container(src, dtype)
         else:
             if np.issubdtype(dtype, np.floating):
-                src = np.random.rand(N).astype(dtype, copy=False)
+                src = np.random.rand(N * 5).astype(dtype, copy=False)
             else:
                 if dtype == np.int8:
                     src = np.random.randint(-100, 100, N, dtype=dtype)
@@ -142,7 +163,7 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10,
                     src = np.random.randint(-1000, 1000, N, dtype=dtype)
             src_sa = make_container(src, dtype)
         modmesh.call_profiler.reset()
-        for _ in range(it):
+        for i in range(it):
             if axis is not None:
                 prof_func_np(src, axis=axis)
                 prof_func_sa(src_sa, axis=axis)
@@ -159,14 +180,22 @@ def profile_stat_op(op, prof_func_np, prof_func_sa, dtype, sizes, it=10,
         result_row = {
             'operation': op,
             'dtype': str(dtype),
-            'size': N if axis is None else f"{shape[0]}x{shape[1]}",
+            'size': N if axis is None and dims is None
+            else f"{shape[0]}x{shape[1]}" if axis is not None
+            else f"{shape[0]}x{shape[1]}x{shape[2]}" if dims == 3 else N,
             'size_numeric': N,
-            'axis': axis
+            'axis': axis,
+            'dims': dims
         }
         result_row.update(out)
         results.append(result_row)
 
-        print(f"## N = {N if axis is None else shape}")
+        if axis is not None:
+            print(f"## N = {shape}")
+        elif dims == 3:
+            print(f"## N = {shape}")
+        else:
+            print(f"## N = {N}")
 
         def print_row(*cols):
             print(str.format("| {:10s} | {:15s} | {:15s} |", *(cols[0:3])))
@@ -242,6 +271,7 @@ def create_performance_plots(all_results):
     for op in operations:
         create_1d_performance_plot(all_results, op, dtypes)
         create_axis_performance_plot(all_results, op, dtypes)
+        create_3d_performance_plot(all_results, op, dtypes)
 
 
 def create_1d_performance_plot(all_results, op, dtypes):
@@ -255,7 +285,9 @@ def create_1d_performance_plot(all_results, op, dtypes):
         axes = axes.reshape(1, -1)
 
     op_data = [r for r in all_results
-               if r['operation'] == op and r['axis'] is None]
+               if r['operation'] == op
+               and r['axis'] is None
+               and r.get('dims') is None]
 
     for i, dtype in enumerate(sorted_dtypes):
         dtype_data = [r for r in op_data if r['dtype'] == dtype]
@@ -366,12 +398,63 @@ def create_axis_performance_plot(all_results, op, dtypes):
     plt.close()
 
 
+def create_3d_performance_plot(all_results, op, dtypes):
+    sorted_dtypes = sorted(dtypes)
+    n_dtypes = len(sorted_dtypes)
+
+    fig, axes = plt.subplots(n_dtypes, 2, figsize=(15, 4*n_dtypes))
+    fig.suptitle(f'3D Performance: {op.title()}', fontsize=16)
+
+    if n_dtypes == 1:
+        axes = axes.reshape(1, -1)
+
+    op_data = [r for r in all_results
+               if r['operation'] == op and r['dims'] == 3]
+
+    for i, dtype in enumerate(sorted_dtypes):
+        dtype_data = [r for r in op_data if r['dtype'] == dtype]
+
+        sizes = [r['size_numeric'] for r in dtype_data]
+        np_times = [r['np'] for r in dtype_data]
+        sa_times = [r['sa'] for r in dtype_data]
+
+        ax1 = axes[i, 0]
+        ax1.plot(sizes, np_times, 'o-', label='NumPy', color='blue')
+        ax1.plot(sizes, sa_times, 's-', label='SimpleArray', color='red')
+        ax1.set_xlabel('Array Size')
+        ax1.set_ylabel('Time per Call (ms)')
+        ax1.set_title(f'{dtype} - 3D Performance')
+        ax1.set_xscale('log')
+        ax1.set_yscale('log')
+        ax1.legend()
+        ax1.grid(True, alpha=0.3)
+
+        ax2 = axes[i, 1]
+        speedups = [r['np'] / r['sa'] for r in dtype_data]
+        ax2.plot(sizes, speedups, 'D-', color='green')
+        ax2.set_xlabel('Array Size')
+        ax2.set_ylabel('Speedup (NumPy / SimpleArray)')
+        ax2.set_title(f'{dtype} - 3D Speedup')
+        ax2.set_xscale('log')
+        ax2.axhline(y=1, color='black', linestyle='--', alpha=0.5)
+        ax2.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig('profiling/results/png/'
+                + 'performance_3d_non_contiguous_'
+                + f'{op}.png',
+                dpi=300, bbox_inches='tight')
+    plt.close()
+
+
 def main():
+    print("Starting performance profiling...")
     sizes = [10, 10**3, 10**6]
     dtypes = [np.float64, np.float32, np.int64, np.int32, np.int8]
     all_results = []
 
     for dtype in dtypes:
+        print(f"Testing dtype: {dtype}")
         operations_1d = [
             ("median", profile_median_np, profile_median_sa),
             ("mean", profile_mean_np, profile_mean_sa),
@@ -399,6 +482,20 @@ def main():
                     op, prof_func_np, prof_func_sa, dtype, sizes, axis=axis)
                 all_results.extend(results)
 
+        # 3D without axis tests
+        operations_3d = [
+            ("median", profile_median_np, profile_median_sa),
+            ("mean", profile_mean_np, profile_mean_sa),
+            ("var", profile_var_np, profile_var_sa),
+            ("std", profile_std_np, profile_std_sa),
+            ("average", profile_average_np, profile_average_sa)
+        ]
+
+        for op, prof_func_np, prof_func_sa in operations_3d:
+            results = profile_stat_op(
+                op, prof_func_np, prof_func_sa, dtype, sizes, dims=3)
+            all_results.extend(results)
+
     create_performance_plots(all_results)
 
 
diff --git a/tests/test_buffer.py b/tests/test_buffer.py
index 38165d03c..7d414e998 100644
--- a/tests/test_buffer.py
+++ b/tests/test_buffer.py
@@ -986,6 +986,12 @@ def test_minmaxsum(self):
         self.assertEqual(sarr.min(), -2.3)
         self.assertEqual(sarr.max(), 9.2)
 
+    def test_sum_non_contiguous(self):
+        nparr = np.arange(625, dtype='float64').reshape((5, 5, 5, 5))
+        nparr = nparr[:3:2, 1:4:2, :3:3, 3:4:2]
+        sarr = modmesh.SimpleArrayFloat64(array=nparr)
+        self.assertEqual(sarr.sum(), np.sum(nparr))
+
     def test_abs(self):
         sarr = modmesh.SimpleArrayInt64(shape=(3, 2), value=-2)
         self.assertEqual(sarr.sum(), -2 * 3 * 2)