hmella
diff --git a/‎CMakeLists.txt‎
Lines changed: 31 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 31 additions & 2 deletions
diff --git a/‎cpp/feelmri/Assemble.cpp‎
Lines changed: 144 additions & 0 deletions b/‎cpp/feelmri/Assemble.cpp‎
Lines changed: 144 additions & 0 deletions
diff --git a/‎cpp/feelmri/Assemble.h‎
Lines changed: 37 additions & 0 deletions b/‎cpp/feelmri/Assemble.h‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎cpp/feelmri/BlochSimulator.cpp‎
Lines changed: 6 additions & 0 deletions b/‎cpp/feelmri/BlochSimulator.cpp‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cpp/feelmri/BlochSimulator.h‎
Lines changed: 2 additions & 0 deletions b/‎cpp/feelmri/BlochSimulator.h‎
Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,29 @@ endif()
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
 find_package(pybind11 CONFIG REQUIRED)
 
+# --- Basix: use the basix that is installed in this Python env (fenics-basix wheel) ---
+# The interpreter located by find_package(Python) is asked for the directory of
+# the imported `basix` package; its `lib` subdirectory is where the shared
+# library is looked up. stderr is captured so that a failing import can be
+# reported together with the interpreter that was used.
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c
+    "import basix, os; print(os.path.join(os.path.dirname(basix.__file__), 'lib'))"
+  OUTPUT_VARIABLE BASIX_PY_LIBDIR
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  ERROR_VARIABLE _basix_err
+  ERROR_STRIP_TRAILING_WHITESPACE
+  RESULT_VARIABLE _basix_rc
+)
+if(NOT _basix_rc EQUAL 0)
+  message(FATAL_ERROR
+    "Basix not importable in this Python env. Install: python -m pip install fenics-basix\n"
+    "Interpreter: ${Python_EXECUTABLE}\n"
+    "Python error: ${_basix_err}")
+endif()
+
+# Print the searched directory *before* the REQUIRED lookup: if the library is
+# missing, configuration stops inside find_library() and a later message would
+# never be shown.
+message(STATUS "Basix wheel libdir: ${BASIX_PY_LIBDIR}")
+
+# Only the wheel directory is searched (NO_DEFAULT_PATH) so that a system-wide
+# Basix cannot be picked up instead of the one matching the Python package.
+find_library(BASIX_LIB
+  NAMES basix
+  PATHS "${BASIX_PY_LIBDIR}"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+message(STATUS "Basix wheel lib:    ${BASIX_LIB}")
+
 # ------------------------------------------------------------
 # Eigen3 automatic installation (cached and robust)
 # ------------------------------------------------------------
@@ -130,7 +153,7 @@ endif()
 # Source setup (core modules)
 # ------------------------------------------------------------
 set(SRC_DIR cpp/feelmri)
-set(MODULES Assemble BlochSimulator MRI POD)
+set(MODULES MRIAssemble Assemble BlochSimulator PODHelper)
 
 foreach(mod ${MODULES})
     message(STATUS "Building module: ${mod}")
@@ -145,7 +168,9 @@ foreach(mod ${MODULES})
             ${Python_INCLUDE_DIRS}
     )
 
-    # Apply high-performance compile flags
+		target_link_libraries(${mod} PRIVATE "${BASIX_LIB}")
+
+		# Apply high-performance compile flags
     target_compile_options(${mod} PRIVATE
         -Ofast
         -ffast-math
@@ -154,6 +179,10 @@ foreach(mod ${MODULES})
         -ffp-contract=fast
         -fvisibility=hidden
         -fPIC
+        -funroll-loops
+        -fopenmp-simd
+        -mavx2
+        -mfma
         -DNDEBUG
         -DEIGEN_NO_DEBUG
         -DEIGEN_FAST_MATH
 
@@ -1,4 +1,13 @@
 #include "Assemble.h"
+#include "FEUtils.h"
+#include <pybind11/pybind11.h>
+#include <pybind11/eigen.h>
+#include <pybind11/stl.h>
+#include <cmath>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
 
 namespace py = pybind11;
 
@@ -114,3 +123,138 @@ Eigen::SparseMatrix<T> MassAssemble(
     M.setFromTriplets(coefficients.begin(), coefficients.end());
     return M;
 }
+
+
+template <typename T, std::size_t d>
+using mdspan_t = basix::md::mdspan<T, basix::md::dextents<std::size_t, d>>;
+
+// -----------------------------------------------------------------------------
+// Element mass matrix from pre-tabulated Basix basis values and derivatives
+// -----------------------------------------------------------------------------
+// Overwrites `Me` with  sum_q |det J_q| * wts[q] * S_q * S_q^T,  where S_q holds
+// the basis values at quadrature point q and J_q is the 3x3 Jacobian built from
+// the first derivatives and the element node coordinates.
+//
+//   Me          output buffer, zeroed here and then accumulated into
+//   elem_nodes  coordinates of the element nodes, one row per node (x, y, z)
+//   wts         quadrature weights, indexed by quadrature point
+//   tab         tabulation indexed as (derivative, point, dof, value component);
+//               derivative index 0 is read as the value and 1..3 as the three
+//               first derivatives
+//   nb_dofs     number of basis functions read per quadrature point
+//   nq          number of quadrature points
+//
+// NOTE(review): the maps below read `nb_dofs` consecutive entries starting at
+// tab(k, q, 0, 0), i.e. they assume the dof index is contiguous in memory
+// (value size 1) -- confirm against the Basix tabulate() layout.
+template <typename T>
+static inline void basixLocalMassAssemble(
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& Me,
+    const Eigen::Matrix<T, Eigen::Dynamic, 3>& elem_nodes,
+    const std::vector<T>& wts,
+    const mdspan_t<const T, 4>& tab,
+    int nb_dofs,
+    int nq)
+{
+    using ColVec      = Eigen::Matrix<T, Eigen::Dynamic, 1>;
+    using ConstVecMap = Eigen::Map<const ColVec>;
+
+    // The caller reuses this buffer between elements, so start from zero.
+    Me.setZero();
+
+    for (int qp = 0; qp < nq; ++qp)
+    {
+        // Views straight into the tabulation storage; nothing is copied.
+        const ConstVecMap phi    (&tab(0, qp, 0, 0), nb_dofs);
+        const ConstVecMap dphi_dr(&tab(1, qp, 0, 0), nb_dofs);
+        const ConstVecMap dphi_ds(&tab(2, qp, 0, 0), nb_dofs);
+        const ConstVecMap dphi_dt(&tab(3, qp, 0, 0), nb_dofs);
+
+        // Jacobian: each row is one reference derivative contracted with the
+        // node coordinates.
+        Eigen::Matrix<T, 3, 3> jac;
+        jac.row(0) = dphi_dr.transpose() * elem_nodes;
+        jac.row(1) = dphi_ds.transpose() * elem_nodes;
+        jac.row(2) = dphi_dt.transpose() * elem_nodes;
+
+        // |det J| times the quadrature weight scales this point's contribution.
+        const T scale = std::abs(jac.determinant()) * wts[qp];
+
+        // Outer-product accumulation; noalias() skips the temporary.
+        Me.noalias() += scale * (phi * phi.transpose());
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Global mass matrix assembly using a Basix-tabulated element
+// -----------------------------------------------------------------------------
+// Builds the sparse (nodes.rows() x nodes.rows()) mass matrix of a mesh of
+// volumetric (3D) cells.
+//
+//   elems              connectivity, one row per element, entries are row
+//                      indices into `nodes`
+//   nodes              node coordinates, one row per node, exactly 3 columns
+//   meshio_type        cell type name, resolved through get_fe_info()
+//   quadrature_variant currently unused
+//   quadrature_rule    currently unused (the Basix default rule is used)
+//   quadrature_degree  degree passed to basix::quadrature::make_quadrature
+//
+// Throws std::invalid_argument when the inputs are inconsistent (wrong number of
+// coordinate columns, node indices out of range, a non-3D cell type, or a
+// nodes-per-element count that differs from the element's dof count) and
+// std::runtime_error when the tabulation does not have the layout the local
+// assembly relies on. These checks matter because Eigen's own assertions are
+// compiled out in optimized builds.
+template <typename T>
+Eigen::SparseMatrix<T> basixMassAssemble(
+    const Eigen::MatrixXi& elems,
+    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& nodes,
+    const std::string& meshio_type,
+    const std::string& /*quadrature_variant*/,
+    const std::string& /*quadrature_rule*/,
+    const int quadrature_degree)
+{
+    const int nb_elems   = static_cast<int>(elems.rows());
+    const int nb_nodes   = static_cast<int>(nodes.rows());
+    const int nb_nodes_e = static_cast<int>(elems.cols());
+
+    // The local assembly works on fixed 3-column coordinates and a 3x3 Jacobian.
+    constexpr std::size_t gdim = 3;
+
+    if (nodes.cols() != static_cast<Eigen::Index>(gdim))
+        throw std::invalid_argument(
+            "basixMassAssemble: nodes must have exactly 3 columns, got "
+            + std::to_string(nodes.cols()));
+
+    // minCoeff()/maxCoeff() are undefined on empty matrices, hence the size guard.
+    if (elems.size() > 0 && (elems.minCoeff() < 0 || elems.maxCoeff() >= nb_nodes))
+        throw std::invalid_argument(
+            "basixMassAssemble: elems contains a node index outside [0, "
+            + std::to_string(nb_nodes) + ")");
+
+    // Basix setup
+    const auto fe_info = get_fe_info(meshio_type);
+
+    if (basix::cell::topological_dimension(fe_info.cell) != static_cast<int>(gdim))
+        throw std::invalid_argument(
+            "basixMassAssemble: cell type '" + meshio_type
+            + "' is not a 3D cell; only volumetric cells are supported");
+
+    const auto variant = basix::element::lagrange_variant::equispaced;
+
+    const basix::FiniteElement<T> finite_element =
+        basix::create_element<T>(
+            fe_info.family, fe_info.cell, fe_info.degree, variant,
+            basix::element::dpc_variant::unset, false);
+
+    const int nb_dofs = finite_element.dim();
+
+    // The Jacobian product and the scatter loop below both require one dof per
+    // element node.
+    if (nb_dofs != nb_nodes_e)
+        throw std::invalid_argument(
+            "basixMassAssemble: elems has " + std::to_string(nb_nodes_e)
+            + " nodes per element but the '" + meshio_type + "' element has "
+            + std::to_string(nb_dofs) + " dofs");
+
+    auto qw = basix::quadrature::make_quadrature<T>(
+        basix::quadrature::type::Default, fe_info.cell,
+        basix::polyset::type::standard, quadrature_degree);
+
+    const std::vector<T>& qpts_flat = qw[0];
+    const std::vector<T>& wts       = qw[1];
+
+    const int nq = static_cast<int>(wts.size());
+
+    // Tabulate values and first derivatives once; every element reuses them.
+    auto [tab_data, tab_shape] =
+        finite_element.tabulate(
+            1, std::span<const T>(qpts_flat.data(), qpts_flat.size()),
+            {static_cast<std::size_t>(nq), gdim});
+
+    // basixLocalMassAssemble reads derivative slots 0..3 and maps nb_dofs
+    // consecutive entries per point, which needs value size 1.
+    if (tab_shape[0] < 4
+        || tab_shape[1] != static_cast<std::size_t>(nq)
+        || tab_shape[2] != static_cast<std::size_t>(nb_dofs)
+        || tab_shape[3] != 1)
+        throw std::runtime_error(
+            "basixMassAssemble: unexpected tabulation layout for cell type '"
+            + meshio_type + "'");
+
+    mdspan_t<const T, 4> tab(tab_data.data(), tab_shape);
+
+    Eigen::SparseMatrix<T> M(nb_nodes, nb_nodes);
+    using TripletType = Eigen::Triplet<T>;
+
+    // One slot per (element, i, j) entry: the exact size lets every element
+    // write to its own disjoint range without any synchronisation.
+    const std::size_t total_triplets = static_cast<std::size_t>(nb_elems) * nb_nodes_e * nb_nodes_e;
+    std::vector<TripletType> coefficients(total_triplets);
+
+    // NOTE(review): these directives are only honored when the translation unit
+    // is compiled with OpenMP enabled (e.g. -fopenmp); -fopenmp-simd alone only
+    // enables the simd directives, in which case this loop runs serially (still
+    // correct) -- confirm the build flags.
+    #pragma omp parallel
+    {
+        // Per-thread scratch buffers, allocated once outside the element loop.
+        Eigen::Matrix<T, Eigen::Dynamic, 3> elem_nodes(nb_nodes_e, 3);
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> Me(nb_dofs, nb_dofs);
+
+        #pragma omp for
+        for (int e = 0; e < nb_elems; ++e)
+        {
+            const auto elem = elems.row(e);
+
+            // Gather this element's node coordinates.
+            for (int i = 0; i < nb_nodes_e; ++i)
+                elem_nodes.row(i) = nodes.row(elem(i));
+
+            // Populate thread-local Me
+            basixLocalMassAssemble<T>(Me, elem_nodes, wts, tab, nb_dofs, nq);
+
+            // Start of this element's private range in `coefficients`.
+            const std::size_t offset = static_cast<std::size_t>(e) * nb_nodes_e * nb_nodes_e;
+            std::size_t idx = 0;
+
+            for (int i = 0; i < nb_nodes_e; ++i)
+            {
+                for (int j = 0; j < nb_nodes_e; ++j)
+                {
+                    coefficients[offset + idx++] = TripletType(elem(i), elem(j), Me(i, j));
+                }
+            }
+        }
+    } // implicit OpenMP barrier sync
+
+    // Duplicate (row, col) entries from neighbouring elements are summed here.
+    M.setFromTriplets(coefficients.begin(), coefficients.end());
+    return M;
+}
@@ -1,11 +1,19 @@
 #pragma once
+
 #include <pybind11/pybind11.h>
 #include <pybind11/eigen/tensor.h>
 #include <pybind11/eigen.h>
 #include <pybind11/stl.h>
 #include <Eigen/Dense>
 #include <Eigen/Sparse>
 
+#include <basix/finite-element.h>
+#include <basix/quadrature.h>
+#include <basix/mdspan.hpp>
+#include <basix/cell.h>
+#include <basix/element-families.h>
+#include <basix/polyset.h>
+
 namespace py = pybind11;
 
 // MassAssemble
@@ -17,6 +25,17 @@ Eigen::SparseMatrix<T> MassAssemble(
     const py::object &quadrature_rule
 );
 
+// basixMassAssemble
+// Assembles the global sparse mass matrix (nodes.rows() x nodes.rows()) using a
+// Basix element and quadrature; the definition lives in Assemble.cpp.
+//   elems              element connectivity, one row per element
+//   nodes              node coordinates, one row per node
+//   meshio_type        cell type name used to select the Basix element
+//   quadrature_variant accepted but not used by the current definition
+//   quadrature_rule    accepted but not used by the current definition
+//   quadrature_degree  degree requested from the Basix quadrature
+template <typename T>
+Eigen::SparseMatrix<T> basixMassAssemble(
+    const Eigen::MatrixXi &elems,
+    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> &nodes,
+    const std::string &meshio_type,
+    const std::string &quadrature_variant,
+    const std::string &quadrature_rule,
+    const int quadrature_degree
+);
+
 
 // PYBIND11 module
 PYBIND11_MODULE(Assemble, m)
@@ -36,4 +55,22 @@ PYBIND11_MODULE(Assemble, m)
         const py::object &,
         const py::object &
     >(&MassAssemble<double>));
+
+    m.def("basixMassAssemble", py::overload_cast<
+        const Eigen::MatrixXi &,
+        const Eigen::MatrixXf &,
+        const std::string &,
+        const std::string &,
+        const std::string &,
+        const int
+    >(&basixMassAssemble<float>));
+
+    m.def("basixMassAssemble", py::overload_cast<
+        const Eigen::MatrixXi &,
+        const Eigen::MatrixXd &,
+        const std::string &,
+        const std::string &,
+        const std::string &,
+        const int
+    >(&basixMassAssemble<double>));
 }
@@ -1,4 +1,10 @@
 #include "BlochSimulator.h"
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/complex.h>
+#include <complex>
+#include <cmath>
+#include <utility>
 
 using namespace Eigen;
 namespace py = pybind11;
 
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <pybind11/eigen/tensor.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/eigen.h>
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+#pragma once`
	`2`	`+`
`1`	`3`	`#include <pybind11/eigen/tensor.h>`
`2`	`4`	`#include <pybind11/pybind11.h>`
`3`	`5`	`#include <pybind11/eigen.h>`