diff --git a/include/permanent/combinatoric.h b/include/permanent/combinatoric.h
index a367067..9004166 100644
--- a/include/permanent/combinatoric.h
+++ b/include/permanent/combinatoric.h
@@ -5,18 +5,20 @@
 
 #include <permanent/common.h>
 #include <permanent/complex.h>
-#include <permanent/kperm-gray.h>
-#include <permanent/perm-mv0.h>
+#include <permanent/fxt/kperm-gray.h>
+#include <permanent/fxt/perm-mv0.h>
 
 namespace permanent {
 
 template <typename T, typename I = void>
-result_t<T, I> combinatoric_square(const size_t m, const size_t n, const T *ptr)
+result_t<T, I> combinatoric_square(const size_t m, const size_t, const T *ptr)
 {
-  (void)n;
-
   perm_mv0 permutations(m);
 
+  if (m == 1) {
+    return ptr[0];
+  }
+
   result_t<T, I> out = 0;
   const size_t *perm = permutations.data();
   do {
@@ -58,4 +60,4 @@ result_t<T, I> combinatoric(const size_t m, const size_t n, const T *ptr)
 
 }  // namespace permanent
 
-#endif  // permanent_combinatoric_h_
+#endif  // permanent_combinatoric_h_
\ No newline at end of file
diff --git a/include/permanent/common.h b/include/permanent/common.h
index 7654964..a8005dd 100644
--- a/include/permanent/common.h
+++ b/include/permanent/common.h
@@ -3,8 +3,9 @@
 #if !defined(permanent_common_h_)
 #define permanent_common_h_
 
-#include <complex>
 #include <cstdint>
+
+#include <complex>
 #include <type_traits>
 
 namespace permanent {
@@ -12,6 +13,7 @@ namespace permanent {
 // Numerical types
 
 using size_t = std::size_t;
+using ptrdiff_t = std::ptrdiff_t;
 
 using sgn_t = std::int_fast8_t;
 
@@ -51,4 +53,4 @@ using result_t = _result_t<Type, IntType>::type;
 
 }  // namespace permanent
 
-#endif  // permanent_common_h_
+#endif  // permanent_common_h_
\ No newline at end of file
diff --git a/include/permanent/complex.h b/include/permanent/complex.h
index a6e5f7d..6191ea8 100644
--- a/include/permanent/complex.h
+++ b/include/permanent/complex.h
@@ -18,21 +18,21 @@ struct _identity_t
 template <typename T>
 using identity_t = _identity_t<T>::type;
 
-}  // namespace permanent
-
-#define DEF_COMPLEX_OPS(OP)                                                                \
-                                                                                           \
-  template <typename T>                                                                    \
-  std::complex<T> operator OP(const std::complex<T> &x, const permanent::identity_t<T> &y) \
-  {                                                                                        \
-    return x OP y;                                                                         \
-  }                                                                                        \
-                                                                                           \
-  template <typename T>                                                                    \
-  std::complex<T> operator OP(const permanent::identity_t<T> &x, const std::complex<T> &y) \
-  {                                                                                        \
-    return x OP y;                                                                         \
-  }
+namespace complex_ops {
+
+#define DEF_COMPLEX_OPS(OP)                                                                     \
+                                                                                                \
+template <typename T>                                                                           \
+inline std::complex<T> operator OP(const std::complex<T> &x, const permanent::identity_t<T> &y) \
+{                                                                                               \
+  return operator OP(x, y);                                                                     \
+}                                                                                               \
+                                                                                                \
+template <typename T>                                                                           \
+inline std::complex<T> operator OP(const permanent::identity_t<T> &x, const std::complex<T> &y) \
+{                                                                                               \
+  return operator OP(x, y);                                                                     \
+}
 
 DEF_COMPLEX_OPS(+)
 DEF_COMPLEX_OPS(-)
@@ -41,4 +41,8 @@ DEF_COMPLEX_OPS(/)
 
 #undef DEF_COMPLEX_OPS
 
-#endif  // PERMANENT_COMPLEX_H_
+} // namespace complex_ops
+
+}  // namespace permanent
+
+#endif  // PERMANENT_COMPLEX_H_
\ No newline at end of file
diff --git a/include/permanent/fxt/kperm-gray.h b/include/permanent/fxt/kperm-gray.h
new file mode 100644
index 0000000..c50cc78
--- /dev/null
+++ b/include/permanent/fxt/kperm-gray.h
@@ -0,0 +1,152 @@
+#if !defined HAVE_KPERM_GRAY_H__
+#define HAVE_KPERM_GRAY_H__
+// This file is part of the FXT library.
+// Copyright (C) 2010, 2012, 2013, 2014, 2019, 2023 Joerg Arndt
+// License: GNU General Public License version 3 or later,
+// see the file COPYING.txt in the main directory.
+
+#include <cstdlib>
+
+#include <permanent/fxt/swap.h>
+
+class kperm_gray
+// Gray code for k-permutations of n elements.
+// Same as: k-prefixes of permutations of n elements.
+// Same as: arrangements of k out of n elements.
+// CAT algorithm based on mixed radix Gray code
+//   for the factorial number system (falling base).
+{
+ protected:
+  std::size_t d_[64];      // mixed radix digits with radix = [n-1, n-2, ..., 2]
+  std::size_t i_[64];      // directions
+  std::size_t ix_[64];     // permutation (inverse perms in Trotter's order)
+  std::size_t x_[64];      // inverse permutation (perms in Trotter's order)
+  std::size_t n_;          // total of n elements
+  std::size_t k_;          // prefix length: permutations of k elements
+  std::size_t sw1_, sw2_;  // indices of elements swapped most recently
+
+  kperm_gray(const kperm_gray &) = delete;
+  kperm_gray &operator=(const kperm_gray &) = delete;
+
+ public:
+  explicit kperm_gray(std::size_t n)
+  {
+    n_ = n;
+    k_ = n;
+    // d_ = new std::size_t[n_];
+    d_[n - 1] = -1UL;  // sentinel
+    // i_ = new std::size_t[n_];
+    // x_ = new std::size_t[n_];
+    // ix_ = new std::size_t[n_];
+    i_[n_ - 1] = 0;
+    first(n_);
+  }
+
+  ~kperm_gray()
+  {
+    // delete [] i_;
+    // delete [] d_;
+    // delete [] x_;
+    // delete [] ix_;
+  }
+
+  const std::size_t *data() const { return ix_; }
+  const std::size_t *invdata() const { return x_; }
+  void get_swap(std::size_t &s1, std::size_t &s2) const
+  {
+    s1 = sw1_;
+    s2 = sw2_;
+  }
+
+  const std::size_t *mr_digits() const { return d_; }
+
+  void first(std::size_t k)
+  {
+    k_ = k;
+    for (std::size_t j = 0; j < n_ - 1; ++j) d_[j] = 0;
+    for (std::size_t j = 0; j < n_ - 1; ++j) i_[j] = +1;
+    for (std::size_t j = 0; j < n_; ++j) x_[j] = ix_[j] = j;
+    sw1_ = n_ - 1;
+    sw2_ = n_ - 2;
+  }
+
+  //    void last(std::size_t k)
+  //    {
+  //        first(k);  // k_ is set with call
+  //        d_[n_-2] = 1;
+  //        swap2(x_[n_-1], x_[n_-2]);
+  //        swap2(ix_[n_-1], ix_[n_-2]);
+  //        for (std::size_t j=0; j<n_-2; ++j)  i_[j] = -1UL;
+  //        i_[n_-2] = +1;
+  //    }
+
+ private:
+  void swap(std::size_t j, std::size_t im)
+  {
+    const std::size_t x1 = j;        // element j
+    const std::size_t i1 = ix_[x1];  // position of j
+    const std::size_t i2 = i1 + im;  // neighbor
+    const std::size_t x2 = x_[i2];   // position of neighbor
+    x_[i1] = x2;
+    x_[i2] = x1;  // swap2(x_[i1], x_[i2]);
+    ix_[x1] = i2;
+    ix_[x2] = i1;  // swap2(ix_[x1], ix_[x2]);
+    sw1_ = i1;
+    sw2_ = i2;
+  }
+
+ public:
+  bool next()
+  {
+    std::size_t j = 0;
+    std::size_t m1 = n_ - 1;  // nine in falling factorial base
+    std::size_t ij;
+    while ((ij = i_[j])) {
+      std::size_t im = i_[j];
+      std::size_t dj = d_[j] + im;
+      if (dj > m1)  // =^= if ( (dj>m1) || ((long)dj<0) )
+      {
+        i_[j] = -ij;
+      } else {
+        d_[j] = dj;
+        swap(j, im);
+        return true;
+      }
+
+      --m1;
+      ++j;
+      if (j >= k_) return false;
+    }
+    return false;
+  }
+
+  //    bool prev()
+  //    {
+  //        std::size_t j = 0;
+  //        std::size_t m1 = n_ - 1;
+  //        std::size_t ij;
+  //        while ( (ij=i_[j]) )
+  //        {
+  //            std::size_t im = -i_[j];
+  //            std::size_t dj = d_[j] + im;
+  //            if ( dj>m1 )  // =^= if ( (dj>m1) || ((long)dj<0) )
+  //            {
+  //                i_[j] = -ij;
+  //            }
+  //            else
+  //            {
+  //                d_[j] = dj;
+  //                swap(j, im);
+  //                return true;
+  //            }
+  //
+  //            --m1;
+  //            ++j;
+  //            if ( j>=k_ )  return false;
+  //        }
+  //        return false;
+  //    }
+};
+// -------------------------
+
+#endif  // !defined HAVE_KPERM_GRAY_H__
\ No newline at end of file
diff --git a/include/permanent/fxt/perm-mv0.h b/include/permanent/fxt/perm-mv0.h
new file mode 100644
index 0000000..2a7ea79
--- /dev/null
+++ b/include/permanent/fxt/perm-mv0.h
@@ -0,0 +1,120 @@
+#if !defined HAVE_PERM_MV0_H__
+#define HAVE_PERM_MV0_H__
+// This file is part of the FXT library.
+// Copyright (C) 2010, 2011, 2012, 2014, 2019, 2023 Joerg Arndt
+// License: GNU General Public License version 3 or later,
+// see the file COPYING.txt in the main directory.
+
+#include <cstdlib>
+
+#include <permanent/fxt/swap.h>
+
+// define to update d[0] with each step:
+#define PERM_MV0_UPDATE_D0  // default is on
+
+// whether to use arrays instead of pointers:
+#define PERM_MV0_FIXARRAYS  // small speedup; default off
+
+class perm_mv0
+// Inverse permutations corresponding to falling factorial numbers.
+// CAT algorithm based on mixed radix Gray code
+//   for the factorial number system (falling base).
+{
+ public:
+#if !defined PERM_MV0_FIXARRAYS
+  std::size_t *d_;  // mixed radix digits with radix = [n-1, n-2, n-3, ..., 2]
+  std::size_t *x_;  // permutation
+#else
+  std::size_t d_[64];
+  std::size_t x_[64];
+#endif
+  std::size_t ect_;  // counter for easy case
+  std::size_t n_;    // permutations of n elements
+
+  perm_mv0(const perm_mv0 &) = delete;
+  perm_mv0 &operator=(const perm_mv0 &) = delete;
+
+ public:
+  explicit perm_mv0(std::size_t n)
+  // Must have n>=2
+  {
+    n_ = n;
+#if !defined PERM_MV0_FIXARRAYS
+    d_ = new std::size_t[n_];
+    x_ = new std::size_t[n_];
+#endif
+    d_[n - 1] = 1;  // sentinel (must be nonzero)
+    first();
+  }
+
+  ~perm_mv0()
+  {
+#if !defined PERM_MV0_FIXARRAYS
+    delete[] d_;
+    delete[] x_;
+#endif
+  }
+
+  const std::size_t *data() const { return x_; }
+
+  void first()
+  {
+    for (std::size_t k = 0; k < n_; ++k) x_[k] = k;
+    for (std::size_t k = 0; k < n_ - 1; ++k) d_[k] = 0;
+    ect_ = 0;
+  }
+
+  bool next()
+  {
+    if (++ect_ < n_)  // easy case
+    {
+#if defined PERM_MV0_UPDATE_D0
+      d_[0] = ect_;
+#endif
+      swap2(x_[ect_], x_[ect_ - 1]);
+      return true;
+    } else {
+      ect_ = 0;
+#if defined PERM_MV0_UPDATE_D0
+      d_[0] = ect_;
+#endif
+
+      std::size_t j = 1;
+      std::size_t m1 = n_ - 2;  // nine in falling factorial base
+      while (d_[j] == m1)       // find digit to increment
+      {
+        d_[j] = 0;
+        --m1;
+        ++j;
+      }
+
+      if (j == n_ - 1) return false;  // current permutation is last
+
+      const std::size_t dj = d_[j];
+      d_[j] = dj + 1;
+
+      // element at d[j] moves one position to the right:
+      swap2(x_[dj], x_[dj + 1]);
+
+      {  // move n-j elements to end:
+        std::size_t s = n_ - j, d = n_;
+        do {
+          --s;
+          --d;
+          x_[d] = x_[s];
+        } while (s);
+      }
+
+      // fill in 0,1,2,..,j-1 at start:
+      for (std::size_t k = 0; k < j; ++k) x_[k] = k;
+
+      return true;
+    }
+  }
+};
+// -------------------------
+
+// #undef PERM_MV0_UPDATE_D0
+// #undef PERM_MV0_FIXARRAYS
+
+#endif  // !defined HAVE_PERM_MV0_H__
\ No newline at end of file
diff --git a/include/permanent/fxt/swap.h b/include/permanent/fxt/swap.h
new file mode 100644
index 0000000..f08cc62
--- /dev/null
+++ b/include/permanent/fxt/swap.h
@@ -0,0 +1,25 @@
+#if !defined HAVE_SWAP_H__
+#define HAVE_SWAP_H__
+// This file is part of the FXT library.
+// Copyright (C) 2010 Joerg Arndt
+// License: GNU General Public License version 3 or later,
+// see the file COPYING.txt in the main directory.
+
+template <typename Type>
+static inline void swap2(Type &x, Type &y)
+// swap values
+{
+  Type t(x);
+  x = y;
+  y = t;
+}
+
+template <typename Type>
+static inline void swap0(Type &x, Type &y)
+// swap() for y known to be zero
+{
+  y = x;
+  x = 0;
+}
+
+#endif  // !defined HAVE_SWAP_H__
diff --git a/include/permanent/glynn.h b/include/permanent/glynn.h
index 4969f7e..69d8861 100644
--- a/include/permanent/glynn.h
+++ b/include/permanent/glynn.h
@@ -10,350 +10,125 @@
 namespace permanent {
 
 template <typename T, typename I = void>
-result_t<T, I> glynn_square(const size_t m, const size_t n, const T *ptr)
+result_t<T, I> glynn_square(const size_t m, const size_t, const T *ptr)
 {
-  (void)n;
+  using namespace complex_ops;
 
-  /* Fill delta array (all +1 to start), and permutation array with [0...m]. */
-
-  sgn_t delta[64];
-  size_t perm[64 + 1];
-  for (size_t i = 0; i < m; ++i) {
-    perm[i] = i;
-    delta[i] = 1;
-  }
-
-  /* Handle first permutation. */
-
-  // result_t<T, I> vec[64];
-  result_t<T, I> out = 1;
-  for (size_t j = 0; j < m; ++j) {
-    result_t<T, I> sum = 0;
-    for (size_t i = 0; i < m; ++i) {
-      sum += ptr[i * m + j] * delta[i];
-    }
-    out *= sum;
-    // vec[j] = sum;
-  }
-  // for (size_t j = 0; j < m; ++j) {
-  //     out *= vec[j];
-  // }
-
-  /* Iterate from the second to the final permutation. */
-
-  std::size_t bound = m - 1;
-  std::size_t pos = 0;
-  sgn_t sign = 1;
-  while (pos != bound) {
-    /* Update sign and delta. */
-
-    sign *= -1;
-    delta[bound - pos] *= -1;
-
-    /* Compute term. */
-
-    result_t<T, I> prod = 1;
-    for (size_t j = 0; j < m; ++j) {
-      result_t<T, I> sum = 0;
-      for (size_t i = 0; i < m; ++i) {
-        sum += ptr[i * m + j] * delta[i];
-      }
-      // vec[j] = sum;
-      prod *= sum;
-    }
-
-    /* Multiply by the product of the vectors in delta. */
-
-    // for (size_t i = 0; i < m; ++i) {
-    //     prod *= vec[i];
-    // }
-    out += sign * prod;
-
-    /* Go to next permutation. */
-
-    perm[0] = 0;
-    perm[pos] = perm[pos + 1];
-    ++pos;
-    perm[pos] = pos;
-    pos = perm[0];
-  }
-
-  /* Divide by external factor and return permanent. */
-
-  return out / (1UL << bound);
-}
-
-template <typename T, typename I = void>
-result_t<T, I> glynn_rectangular(const size_t m, const size_t n, const T *ptr)
-{
-  /* Fill delta array (all +1 to start), and permutation array with [0...n].
-   */
-
-  sgn_t delta[64];
-  size_t perm[64 + 1];
-  for (size_t i = 0; i < n; ++i) {
+  sgn_t delta[64];        // delta array (all +1 to start)
+  size_t perm[64 + 1];    // permutation array with {0, ..., m}
+  result_t<T, I> vec[64]; // intermediate row sums
+  result_t<T, I> out = 1; // result
+  for (size_t i = 0; i != m; ++i) {
     delta[i] = 1;
-    perm[i] = i;
-  }
-
-  /* Handle first permutation. */
-
-  result_t<T, I> out = 1;
-  for (size_t j = 0; j < n; ++j) {
-    result_t<T, I> sum = 0;
-    for (size_t i = 0; i < m; ++i) {
-      sum += ptr[i * n + j] * delta[i];
-    }
-    for (size_t k = m; k < n; ++k) {
-      sum += delta[k];
-    }
-    // vec[j] = sum;
-    out *= sum;
-  }
-  // for (i = 0; i < n; ++i) {
-  //     out *= vec[i];
-  // }
-
-  /* Iterate from the second to the final permutation. */
-
-  size_t pos = 0;
-  size_t bound = n - 1;
-  sgn_t sign = 1;
-  while (pos != bound) {
-    /* Update sign and delta. */
-
-    sign *= -1;
-    delta[bound - pos] *= -1;
-
-    /* Compute term. */
-
-    result_t<T, I> prod = 1;
-    for (size_t j = 0; j < n; ++j) {
-      result_t<T, I> sum = 0;
-      for (size_t i = 0; i < m; ++i) {
-        sum += ptr[i * n + j] * delta[i];
-      }
-
-      for (size_t k = m; k < n; ++k) {
-        sum += delta[k];
-      }
-      prod *= sum;
-    }
-
-    /* Multiply by the product of the vectors in delta. */
-
-    // T prod = 1.0;
-    // for (i = 0; i < n; ++i) {
-    //     prod *= vec[i];
-    // }
-    out += sign * prod;
-
-    /* Go to next permutation. */
-
-    perm[0] = 0;
-    perm[pos] = perm[pos + 1];
-    ++pos;
-    perm[pos] = pos;
-    pos = perm[0];
   }
-
-  /* Divide by external factor and return permanent. */
-
-  return (out / (1UL << bound)) / factorial<result_t<T, I>>(n - m);
-}
-
-template <typename T, typename I = void>
-result_t<T, I> glynn_square1(const size_t m, const size_t n, const T *ptr)
-{
-  (void)n;
-
-  // Fill delta array ([+1...+1]) and permutation array ([0...m])
-  size_t perm[64 + 1];
-  sgn_t delta[64];
   for (size_t i = 0; i != m; ++i) {
     perm[i] = i;
-    delta[i] = 1;
   }
 
-  // Compute first permutation
-  result_t<T, I> vec[64];
-  result_t<T, I> out = 1;
+  // first permutation
   for (size_t j = 0; j != m; ++j) {
-    // Compute inner terms
     result_t<T, I> sum = 0;
     for (size_t i = 0; i != m; ++i) {
-      sum += ptr[i * m + j];
+      sum += ptr[i * m + j] * delta[i];
     }
     vec[j] = sum;
-
-    // Compute first outer term
-    out *= sum;
+  }
+  for (size_t j = 0; j != m; ++j) {
+      out *= vec[j];
   }
 
-  // Iterate from the second to the final permutation
+  // further permutations
   size_t bound = m - 1;
   size_t pos = 0;
-  sgn_t sign = -1;
+  sgn_t sign = 1;
   while (pos != bound) {
-    // Update delta
-    size_t idx = bound - pos;
-    delta[idx] *= -1;
-
-    // Update inner terms
-    for (size_t i = 0; i != m; ++i) {
-      vec[i] += ptr[idx * m + i] * delta[idx] * 2;
+    // update sign and delta
+    sign *= -1;
+    delta[bound - pos] *= -1;
+    // compute term
+    for (size_t j = 0; j != m; ++j) {
+      vec[j] += delta[bound - pos] * ptr[m * (bound - pos) + j] * 2;
     }
-
-    // Add product of inner terms to outer term
     result_t<T, I> prod = 1;
-    for (size_t i = 0; i != m; ++i) {
-      prod *= vec[i];
+    for (size_t i = 0; i < m; ++i) {
+        prod *= vec[i];
     }
     out += sign * prod;
-
-    // Go to next permutation
+    // advance permutation
     perm[0] = 0;
     perm[pos] = perm[pos + 1];
     ++pos;
     perm[pos] = pos;
     pos = perm[0];
-    sign *= -1;
   }
 
-  // Divide outer term by external factor and return permanent
-  return out / (1UL << bound);
+  return out / (static_cast<size_t>(1) << bound);
 }
 
 template <typename T, typename I = void>
-result_t<T, I> glynn_rectangular1(const size_t m, const size_t n, const T *ptr)
+result_t<T, I> glynn_rectangular(const size_t m, const size_t n, const T *ptr)
 {
-  // Fill delta array ([+1...+1]) and permutation array ([0...m])
-  size_t perm[64 + 1];
-  sgn_t delta[64];
+  using namespace complex_ops;
+
+  sgn_t delta[64];        // delta array (all +1 to start)
+  size_t perm[64 + 1];    // permutation array with {0, ..., m}
+  result_t<T, I> vec[64]; // intermediate row sums
+  result_t<T, I> out = 1; // result
   for (size_t i = 0; i != n; ++i) {
-    perm[i] = i;
     delta[i] = 1;
   }
-
-  // Compute first permutation
-  result_t<T, I> vec[64];
-  result_t<T, I> out = n * (n - m);
-  for (size_t j = 0; j != m; ++j) {
-    // Compute inner terms
-    result_t<T, I> sum = 0;
-    for (size_t i = 0; i != n; ++i) {
-      sum += ptr[j * n + i];
-    }
-    vec[j] = sum;
-
-    // Compute first outer term
-    out *= sum;
-  }
-  for (size_t j = m; j != n; ++j) {
-    vec[j] = n;
-  }
-
-  // Iterate from the second to the final permutation
-  size_t bound = n - 1;
-  size_t pos = 0;
-  sgn_t sign = -1;
-  while (pos != bound) {
-    // Update delta
-    size_t idx = bound - pos;
-    delta[idx] *= -1;
-
-    // Update inner terms
-    for (size_t i = 0; i != m; ++i) {
-      vec[i] += ptr[i * n + idx] * delta[idx] * 2;
-    }
-    for (size_t i = m; i != n; ++i) {
-      vec[i] += delta[idx] * 2;
-    }
-
-    // Add product of inner terms to outer term
-    result_t<T, I> prod = 1;
-    for (size_t i = 0; i != n; ++i) {
-      prod *= vec[i];
-    }
-    out += sign * prod;
-
-    // Go to next permutation
-    perm[0] = 0;
-    perm[pos] = perm[pos + 1];
-    ++pos;
-    perm[pos] = pos;
-    pos = perm[0];
-    sign *= -1;
-  }
-
-  // Divide outer term by external factor and return permanent
-  return (out / (1UL << bound)) / factorial<result_t<T, I>>(n - m);
-}
-
-template <typename T, typename I = void>
-result_t<T, I> glynn_rectangular2(const size_t m, const size_t n, const T *ptr)
-{
-  // Fill delta array ([+1...+1]) and permutation array ([0...m])
-  size_t perm[64 + 1];
-  sgn_t delta[64];
   for (size_t i = 0; i != n; ++i) {
     perm[i] = i;
-    delta[i] = 1;
   }
 
-  // Compute first permutation
-  result_t<T, I> vec[64];
-  result_t<T, I> out = 1;
+  // first permutation
   for (size_t j = 0; j != n; ++j) {
-    // Compute inner terms
-    result_t<T, I> sum = n - m;
+    result_t<T, I> sum = 0;
     for (size_t i = 0; i != m; ++i) {
-      sum += ptr[i * n + j];
+      sum += ptr[i * n + j] * delta[i];
+    }
+    for (size_t i = m; i != n; ++i) {
+      sum += delta[i];
     }
     vec[j] = sum;
-
-    // Compute first outer term
-    out *= sum;
+  }
+  for (size_t j = 0; j != n; ++j) {
+      out *= vec[j];
   }
 
-  // Iterate from the second to the final permutation
-  size_t bound = n - 1;
+  // further permutations
   size_t pos = 0;
-  sgn_t sign = -1;
+  size_t bound = n - 1;
+  sgn_t sign = 1;
   while (pos != bound) {
-    // Update delta
-    size_t idx = bound - pos;
-    delta[idx] *= -1;
-
-    // Update inner terms
-    if (idx < m) {
-      for (size_t i = 0; i != n; ++i) {
-        vec[i] += ptr[idx * n + i] * delta[idx] * 2;
+    // update sign and delta
+    sign *= -1;
+    delta[bound - pos] *= -1;
+    // compute term
+    if (bound - pos < m) {
+      for (size_t j = 0; j != n; ++j) {
+        vec[j] += delta[bound - pos] * ptr[n * (bound - pos) + j] * 2;
       }
     } else {
-      for (size_t i = 0; i != n; ++i) {
-        vec[i] += delta[idx] * 2;
+      for (size_t j = 0; j != n; ++j) {
+        vec[j] += delta[bound - pos] * 2;
       }
     }
-
-    // Add product of inner terms to outer term
     result_t<T, I> prod = 1;
     for (size_t i = 0; i != n; ++i) {
-      prod *= vec[i];
+        prod *= vec[i];
     }
     out += sign * prod;
 
-    // Go to next permutation
+    // advance permutation
     perm[0] = 0;
     perm[pos] = perm[pos + 1];
     ++pos;
     perm[pos] = pos;
     pos = perm[0];
-    sign *= -1;
   }
 
-  // Divide outer term by external factor and return permanent
-  return (out / (1UL << bound)) / factorial<result_t<T, I>>(n - m);
+  return (out / (static_cast<size_t>(1) << bound)) / factorial<result_t<T, I>>(n - m);
 }
 
 template <typename T, typename I = void>
@@ -364,4 +139,4 @@ result_t<T, I> glynn(const size_t m, const size_t n, const T *ptr)
 
 }  // namespace permanent
 
-#endif  // permanent_glynn_h_
+#endif  // permanent_glynn_h_
\ No newline at end of file
diff --git a/include/permanent/opt.h b/include/permanent/opt.h
index 24e3b00..d9fc2fc 100644
--- a/include/permanent/opt.h
+++ b/include/permanent/opt.h
@@ -3,9 +3,10 @@
 #if !defined(permanent_opt_h_)
 #define permanent_opt_h_
 
-#include <permanent/combinatoric.h>
 #include <permanent/common.h>
 #include <permanent/complex.h>
+
+#include <permanent/combinatoric.h>
 #include <permanent/glynn.h>
 #include <permanent/ryser.h>
 
diff --git a/include/permanent/ryser.h b/include/permanent/ryser.h
index 441c977..a1fefdd 100644
--- a/include/permanent/ryser.h
+++ b/include/permanent/ryser.h
@@ -7,172 +7,129 @@
 #include <permanent/complex.h>
 #include <permanent/tables.h>
 
-#include <bit>
-#include <vector>
-
-namespace {
-
-inline void all_combinations(int n, int k, std::vector<std::vector<int>> &k_perms)
-{
-  k_perms.clear();  // Clear k_perms to make sure it starts empty
-  k_perms.reserve(1 << n);
-
-  for (int i = 0; i < (1 << n); i++) {
-    unsigned int cur = i ^ (i >> 1);
-    if (std::popcount(cur) == k) {
-      std::vector<int> vec;  // Initialize vector for new combination
-      vec.reserve(n);
-      for (int j = 0; j < n; j++) {
-        if (cur & (1 << j)) {
-          vec.push_back(j);  // Add element to vector
-        }
-      }
-      k_perms.push_back(vec);  // Store the combination
-    }
-  }
-}
-
-template <typename T>
-auto parity(const T &x)
-{
-  return 1 - ((x & 1) << 1);
-};
-
-template <typename T>
-void swap2(T *perm, T i, T j)
-{
-  const T temp = perm[i];
-  perm[i] = perm[j];
-  perm[j] = temp;
-}
-
-template <typename T>
-void init_perm(const T N, T *const the_fact_set, T *const the_perm_set, T *const the_inv_set)
-{
-  for (T i = 0; i < N; i++) {
-    the_fact_set[i + 1] = 0;
-    the_perm_set[i] = i;
-    the_inv_set[i] = i;
-  }
-}
-
-template <typename T>
-bool gen_next_perm(T *const falling_fact, T *const perm, T *const inv_perm, const T cols,
-                   const T u_)
-{
-  /* Use the current permutation to check for next permutation
-   * lexicographically, if it exists update the curr_array by swapping the
-   * leftmost changed element (at position i < k). Replace the elements up to
-   * position k by the smallest element lying to the right of i. Do not put
-   * remaining k, .., n - 1 positions in ascending order to improve efficiency
-   * has time complexity O(n) in the worst case. */
-  T i = u_ - 1;
-  T m1 = cols - i - 1;
-
-  /* Begin update of falling_fact - recall falling_fact[0] = 0, so increment
-   * index accordingly. If i becomes negative during the check, you are
-   * pointing to the sentinel value, so you are done generating
-   * permutations. */
-  while (falling_fact[i + 1] == m1) {
-    falling_fact[i + 1] = 0;
-    ++m1;
-    --i;
-  }
-  if (i == 0UL - 1) {
-    return false;
-  }
-  ++falling_fact[i + 1];
-
-  /* Find smallest element perm[i] < perm[j] that lies to the right of
-   * pos i, and then update the state of the permutation using its inverse
-   * to generate next. */
-  T z = perm[i];
-  do {
-    ++z;
-  } while (inv_perm[z] <= i);
-  const T j = inv_perm[z];
-  swap2(perm, i, j);
-  swap2(inv_perm, perm[i], perm[j]);
-  ++i;
-  T y = 0;
-
-  /* Find the smallest elements to the right of position i. */
-  while (i < u_) {
-    while (inv_perm[y] < i) {
-      ++y;
-    }
-    const T k = inv_perm[y];
-    swap2(perm, i, k);
-    swap2(inv_perm, perm[i], perm[k]);
-    ++i;
-  }
-  return true;
-}
-
-}  // namespace
-
 namespace permanent {
 
 template <typename T, typename I = void>
 result_t<T, I> ryser_square(const size_t m, const size_t, const T *ptr)
 {
-  T out = 0;
-  size_t c = 1UL << m;
-
-  for (size_t k = 0; k < c; ++k) {
-    T rowsumprod = 1.0;
-    for (size_t i = 0; i < m; ++i) {
-      T rowsum = 0.0;
-      for (size_t j = 0; j < m; ++j) {
-        if (k & (1UL << j)) {
-          rowsum += ptr[m * i + j];
+  using namespace complex_ops;
+
+  size_t subset[65] = {};          // k-subset with {1,..,n}
+  result_t<T, I> rowsums[64] = {}; // intermediate row sums
+  result_t<T, I> prods[64] = {};   // intermediate products of row sums
+
+  size_t k = 0;
+  do {
+    result_t<T, I> prod = 1;
+    // update subset
+    if (k & 1) { // odd
+      if (subset[k] - 1 == subset[k - 1]) { // remove subset[k] - 1 from position k - 1
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] -= ptr[m * row + subset[k] - 1 - 1]);
         }
+        subset[k - 1] = subset[k];
+        --k;
+      } else { // insert subset[k] - 1 as second last element
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] += ptr[m * row + subset[k] - 1 - 1]);
+        }
+        subset[k + 1] = subset[k];
+        --subset[k];
+        ++k;
+      }
+    } else { // even
+      if (subset[k] == m) { // remove m from end
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] -= ptr[m * row + m - 1]);
+        }
+        --k;
+      } else { // append m
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] += ptr[m * row + m - 1]);
+        }
+        ++k;
+        subset[k] = m;
       }
-      rowsumprod *= rowsum;
     }
-    out += rowsumprod * (1 - ((std::popcount(k) & 1) << 1));
+    prods[k - 1] += prod;
+  } while (k);
+
+  result_t<T, I> out = 0;
+  for (size_t i = 0; i != m; ++i) {
+    out += prods[i] * (1 - (((ptrdiff_t)(i + 1) & 1) << 1));
   }
 
-  return static_cast<result_t<T, I>>(out * ((m % 2 == 1) ? -1 : 1));
+  return out * (1 - (((ptrdiff_t)m & 1) << 1));//(m & 1 ? -1 : 1);
 }
 
 template <typename T, typename I = void>
 result_t<T, I> ryser_rectangular(const size_t m, const size_t n, const T *ptr)
 {
-  int sign = 1;
-  result_t<T, I> out = 0.0;
-  std::vector<std::vector<int>> k_perms;
-
-  /* Iterate over subsets from size 0 to size m */
-
-  for (size_t k = 0; k < m; ++k) {
-    all_combinations(n, m - k, k_perms);
-    size_t bin = binomial((n - m + k), k);
-    result_t<T, I> permsum = 0.0;
-
-    /* Compute permanents of each submatrix */
-    result_t<T, I> vec[64];
-    for (const auto &combination : k_perms) {
-      result_t<T, I> colprod;
-      for (size_t i = 0; i < m; ++i) {
-        result_t<T, I> matsum = 0.0;
-        for (size_t j = 0; j < (m - k); ++j) matsum += ptr[i * n + combination[j]];
-        vec[i] = matsum;
-      }
+  using namespace complex_ops;
 
-      colprod = 1.0;
-      for (size_t i = 0; i < m; ++i) colprod *= vec[i];
-      permsum += colprod * sign * bin;
-    }
+  size_t subset[65] = { 0, 1 }; // k-subset with {1,...,n}
+  result_t<T, I> rowsums[64];           // intermediate row sums
+  result_t<T, I> prods[64] = { 1 };     // intermediate products of row sums
 
-    /* Add term to result */
+  for (size_t row = 0; row != m; ++row) {
+    prods[0] *= (rowsums[row] = ptr[n * row]);
+  }
 
-    out += permsum;
+  for (size_t k = 1; subset[1] != n;) {
+    result_t<T, I> prod = 1;
+    if (k & 1) { // odd
+      if (subset[k] == n) { // remove n from end
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] -= ptr[n * row + n - 1]);
+        }
+        --k;
+      } else {
+        if (k < m) { // append n
+          for (size_t row = 0; row != m; ++row) {
+            prod *= (rowsums[row] += ptr[n * row + n - 1]);
+          }
+          ++k;
+          subset[k] = n;
+        } else { // increment subset[j] -> subset[j] + 1
+          for (size_t row = 0; row != m; ++row) {
+            prod *= (rowsums[row] += ptr[n * row + subset[k] + 1 - 1] - ptr[n * row + subset[k] - 1]);
+          }
+          ++subset[k];
+        }
+      }
+    } else { // even
+      if (subset[k - 1] == subset[k] - 1) { // remove subset[j] - 1 from position j - 1
+        for (size_t row = 0; row != m; ++row) {
+          prod *= (rowsums[row] -= ptr[n * row + subset[k] - 1 - 1]);
+        }
+        subset[k - 1] = subset[k];
+        --k;
+      } else {
+        --subset[k];
+        if (k < m) { // insert subset[j] - 1 as second last element
+          for (size_t row = 0; row != m; ++row) {
+            prod *= (rowsums[row] += ptr[n * row + subset[k] - 1 - 0]);
+          }
+          subset[k + 1] = subset[k] + 1;
+          ++k;
+        } else { // decrement subset[j] -> sunset[j] - 1
+          for (size_t row = 0; row != m; ++row) {
+            prod *= (rowsums[row] += ptr[n * row + subset[k] - 1 - 0] - ptr[n * row + subset[k] - 0]);
+          }
+        }
+      }
+    }
+    prods[k - 1] += prod;
+  }
 
-    sign *= -1;
+  result_t<T, I> out = 0;
+  for (size_t i = 0; i != m; ++i) {
+    out += (prods[i] * binomial(n - i - 1, m - i - 1)) * (1 - (((ptrdiff_t)(m - i - 1) & 1) << 1));
   }
 
   return out;
+
+  return out;
 }
 
 template <typename T, typename I = void>
@@ -183,4 +140,4 @@ result_t<T, I> ryser(const size_t m, const size_t n, const T *ptr)
 
 }  // namespace permanent
 
-#endif  // permanent_ryser_h_
+#endif  // permanent_ryser_h_
\ No newline at end of file
diff --git a/include/permanent/tables.h b/include/permanent/tables.h
index 6a7bd0e..7ffbd12 100644
--- a/include/permanent/tables.h
+++ b/include/permanent/tables.h
@@ -3,11 +3,12 @@
 #if !defined(permanent_tables_h_)
 #define permanent_tables_h_
 
-#include <permanent/common.h>
-
 #include <cstdint>
+
 #include <type_traits>
 
+#include <permanent/common.h>
+
 namespace permanent {
 
 static constexpr size_t binom_table[] = {
@@ -4301,4 +4302,4 @@ inline std::enable_if_t<!std::is_integral_v<T>, double> factorial(const size_t n
 
 }  // namespace permanent
 
-#endif  // permanent_tables_h_
+#endif  // permanent_tables_h_
\ No newline at end of file
diff --git a/tests/test_permanent.py b/tests/test_permanent.py
index 9a089f1..3ec1559 100644
--- a/tests/test_permanent.py
+++ b/tests/test_permanent.py
@@ -3,6 +3,7 @@
 import numpy as np
 import numpy.testing as npt
 import permanent
+import permanent
 import pytest
 
 ATOL = 0e0
@@ -22,6 +23,7 @@
 @pytest.mark.parametrize(
     "args",
     [
+        (np.arange(1, 2, dtype=float).reshape(1, 1), 1),
         (np.arange(1, 5, dtype=float).reshape(2, 2), 10),
         (np.arange(1, 5, dtype=int).reshape(2, 2), 10),
         (np.arange(1, 5, dtype=complex).reshape(2, 2), 10),
@@ -29,6 +31,8 @@
         (np.arange(1, 17, dtype=float).reshape(4, 4), 55456),
         (np.arange(1, 50, dtype=float).reshape(7, 7), 5373548250000),
         # Rectangular matrices
+        (np.arange(1, 3, dtype=float).reshape(1, 2), 3),
+        (np.arange(1, 4, dtype=float).reshape(1, 3), 6),
         (np.arange(1, 7, dtype=float).reshape(2, 3), 58),
         (np.arange(1, 9, dtype=float).reshape(2, 4), 190),
         (np.arange(1, 15, dtype=float).reshape(2, 7), 1820),
@@ -92,4 +96,4 @@ def test_int_dim_diff_gt_20_raises(fn, arg):
 def test_invalid_type_raises(fn, arg):
     r"""Ensure that matrices with an invalid dtype raise a ValueError."""
     with npt.assert_raises(TypeError):
-        fn(arg)
+        fn(arg)
\ No newline at end of file