11#include " Assemble.h"
2+ #include " FEUtils.h"
3+ #include < pybind11/pybind11.h>
4+ #include < pybind11/eigen.h>
5+ #include < pybind11/stl.h>
6+ #include < string>
7+ #include < vector>
8+ #include < tuple>
9+ #include < cmath>
10+ #include < span>
211
312namespace py = pybind11;
413
@@ -114,3 +123,138 @@ Eigen::SparseMatrix<T> MassAssemble(
114123 M.setFromTriplets (coefficients.begin (), coefficients.end ());
115124 return M;
116125}
126+
127+
128+ template <typename T, std::size_t d>
129+ using mdspan_t = basix::md::mdspan<T, basix::md::dextents<std::size_t , d>>;
130+
131+ // -----------------------------------------------------------------------------
132+ // Local mass assembly using *pre-tabulated* basis values/derivatives
133+ // -----------------------------------------------------------------------------
134+ // -----------------------------------------------------------------------------
135+ // Optimized Local mass assembly
136+ // -----------------------------------------------------------------------------
137+ template <typename T>
138+ static inline void basixLocalMassAssemble (
139+ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& Me, // Passed by reference (no alloc)
140+ const Eigen::Matrix<T, Eigen::Dynamic, 3 >& elem_nodes, // Forced 3 columns (no alloc)
141+ const std::vector<T>& wts,
142+ const mdspan_t <const T, 4 >& tab,
143+ int nb_dofs,
144+ int nq)
145+ {
146+ using Vec = Eigen::Matrix<T, Eigen::Dynamic, 1 >;
147+ using Mat3 = Eigen::Matrix<T, 3 , 3 >;
148+
149+ Me.setZero (); // Reset thread-local buffer
150+
151+ for (int q = 0 ; q < nq; ++q)
152+ {
153+ // Zero-copy mapping directly from Basix tabulation.
154+ // Assumes C-style contiguous memory layout where 'dofs' vary fastest after 'value_size' (1).
155+ Eigen::Map<const Vec> S (&tab (0 , q, 0 , 0 ), nb_dofs);
156+ Eigen::Map<const Vec> dSdr (&tab (1 , q, 0 , 0 ), nb_dofs);
157+ Eigen::Map<const Vec> dSds (&tab (2 , q, 0 , 0 ), nb_dofs);
158+ Eigen::Map<const Vec> dSdt (&tab (3 , q, 0 , 0 ), nb_dofs);
159+
160+ // Build Jacobian
161+ Mat3 J;
162+ J.row (0 ) = dSdr.transpose () * elem_nodes;
163+ J.row (1 ) = dSds.transpose () * elem_nodes;
164+ J.row (2 ) = dSdt.transpose () * elem_nodes;
165+
166+ const T detJ = std::abs (J.determinant ());
167+ const T c = detJ * wts[q];
168+
169+ // Rank-1 update. .noalias() prevents Eigen from creating a temporary matrix
170+ Me.noalias () += c * (S * S.transpose ());
171+ }
172+ }
173+
174+ // -----------------------------------------------------------------------------
175+ // Optimized Global Mass assemble
176+ // -----------------------------------------------------------------------------
177+ template <typename T>
178+ Eigen::SparseMatrix<T> basixMassAssemble (
179+ const Eigen::MatrixXi& elems,
180+ const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& nodes,
181+ const std::string& meshio_type,
182+ const std::string& /* quadrature_variant*/ ,
183+ const std::string& /* quadrature_rule*/ ,
184+ const int quadrature_degree)
185+ {
186+ const int nb_elems = elems.rows ();
187+ const int nb_nodes = nodes.rows ();
188+ const int nb_nodes_e = elems.cols ();
189+
190+ // Basix setup
191+ const auto fe_info = get_fe_info (meshio_type);
192+ const auto variant = basix::element::lagrange_variant::equispaced;
193+
194+ const basix::FiniteElement<T> finite_element =
195+ basix::create_element<T>(
196+ fe_info.family , fe_info.cell , fe_info.degree , variant,
197+ basix::element::dpc_variant::unset, false );
198+
199+ const int nb_dofs = finite_element.dim ();
200+
201+ auto qw = basix::quadrature::make_quadrature<T>(
202+ basix::quadrature::type::Default, fe_info.cell ,
203+ basix::polyset::type::standard, quadrature_degree);
204+
205+ const std::vector<T>& qpts_flat = qw[0 ];
206+ const std::vector<T>& wts = qw[1 ];
207+
208+ const std::size_t gdim = 3 ;
209+ const int nq = static_cast <int >(wts.size ());
210+
211+ auto [tab_data, tab_shape] =
212+ finite_element.tabulate (
213+ 1 , std::span<const T>(qpts_flat.data (), qpts_flat.size ()),
214+ {static_cast <std::size_t >(nq), gdim});
215+
216+ mdspan_t <const T, 4 > tab (tab_data.data (), tab_shape);
217+
218+ Eigen::SparseMatrix<T> M (nb_nodes, nb_nodes);
219+ using TripletType = Eigen::Triplet<T>;
220+
221+ // Pre-allocate the EXACT size required to avoid resizing and allow threaded assignment
222+ const std::size_t total_triplets = static_cast <std::size_t >(nb_elems) * nb_nodes_e * nb_nodes_e;
223+ std::vector<TripletType> coefficients (total_triplets);
224+
225+ // Multithread the assembly loop
226+ #pragma omp parallel
227+ {
228+ // Thread-local buffers avoid heap allocations inside the hot loop
229+ Eigen::Matrix<T, Eigen::Dynamic, 3 > elem_nodes (nb_nodes_e, 3 );
230+ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> Me (nb_dofs, nb_dofs);
231+
232+ #pragma omp for
233+ for (int e = 0 ; e < nb_elems; ++e)
234+ {
235+ const auto elem = elems.row (e);
236+
237+ for (int i = 0 ; i < nb_nodes_e; ++i)
238+ elem_nodes.row (i) = nodes.row (elem (i));
239+
240+ // Populate thread-local Me
241+ basixLocalMassAssemble<T>(Me, elem_nodes, wts, tab, nb_dofs, nq);
242+
243+ // Compute precise offset for lock-free parallel insertion
244+ const std::size_t offset = static_cast <std::size_t >(e) * nb_nodes_e * nb_nodes_e;
245+ std::size_t idx = 0 ;
246+
247+ for (int i = 0 ; i < nb_nodes_e; ++i)
248+ {
249+ for (int j = 0 ; j < nb_nodes_e; ++j)
250+ {
251+ coefficients[offset + idx++] = TripletType (elem (i), elem (j), Me (i, j));
252+ }
253+ }
254+ }
255+ } // implicit OpenMP barrier sync
256+
257+ // Build sparse matrix
258+ M.setFromTriplets (coefficients.begin (), coefficients.end ());
259+ return M;
260+ }
0 commit comments