diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb58acb8be..5a4f4f6833 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@ with the exception that minor releases may include breaking changes.
 
 ### Added
 
+- ✨ Add a `fuse-two-qubit-unitary-runs` pass
+  for fusing compile-time two-qubit unitary windows via Weyl/KAK resynthesis
+  ([#1655]) ([**@simon1hofmann**])
 - ✨ Add a `fuse-single-qubit-unitary-runs` pass
   for fusing compile-time single-qubit unitary runs via Euler resynthesis
   ([#1672]) ([**@simon1hofmann**], [**@burgholzer**])
@@ -632,6 +635,7 @@ changelogs._
 [#1664]: https://github.com/munich-quantum-toolkit/core/pull/1664
 [#1662]: https://github.com/munich-quantum-toolkit/core/pull/1662
 [#1660]: https://github.com/munich-quantum-toolkit/core/pull/1660
+[#1655]: https://github.com/munich-quantum-toolkit/core/pull/1655
 [#1652]: https://github.com/munich-quantum-toolkit/core/pull/1652
 [#1638]: https://github.com/munich-quantum-toolkit/core/pull/1638
 [#1637]: https://github.com/munich-quantum-toolkit/core/pull/1637
diff --git a/mlir/include/mlir/Compiler/CompilerPipeline.h b/mlir/include/mlir/Compiler/CompilerPipeline.h
index 54d9f039de..1844a75dd7 100644
--- a/mlir/include/mlir/Compiler/CompilerPipeline.h
+++ b/mlir/include/mlir/Compiler/CompilerPipeline.h
@@ -49,6 +49,18 @@ struct QuantumCompilerConfig {
 
   /// Enable Hadamard lifting
   bool enableHadamardLifting = false;
+
+  /// Comma-separated native gate menu. Recognised tokens: `u`, `x`, `sx`,
+  /// `rz` (or `p`), `rx`, `ry`, `r`, `cx`, `cz`, `rzz`.
+  /// Illustrative menus (use `cx` or `cz` as the entangler, or
+  /// both):
+  /// - `"x,sx,rz,cx"` / `"x,sx,rz,cz"` — IBM basic (no fractional 2q)
+  /// - `"x,sx,rz,rx,rzz,cx"` / `"...,cz"` — IBM fractional
+  /// - `"u,cx"` / `"u,cz"` — generic single-qubit U3 + CX/CZ
+  /// - `"r,cz"` — IQM-style default
+  /// - `"rx,rz,cx"`, `"rx,ry,cz"`, `"ry,rz,cx"` — supported RX/RY/RZ pairs plus
+  /// entangler
+  std::string nativeGates;
 };
 
 /**
@@ -84,7 +96,7 @@ struct CompilationRecord {
  * 2. QC cleanup pipeline
  * 3. QCO dialect (value semantics) - enables SSA-based optimizations
  * 4. QCO cleanup pipeline
- * 5. Quantum optimization passes
+ * 5. Optimization and native gate synthesis
  * 6. QCO cleanup pipeline
  * 7. QC dialect - converted back for backend lowering
  * 8. QC cleanup pipeline
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
index 8fb0018b28..5d02898c3b 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
@@ -32,6 +32,7 @@ enum class EulerBasis : std::uint8_t {
   XYX = 3,  ///< `RX(phi) * RY(theta) * RX(lambda)`.
   U = 4,    ///< `U(theta, phi, lambda)`.
   ZSXX = 5, ///< `RZ` / `SX` / `X` synthesis via ZYZ decomposition.
+  R = 6,    ///< `R(.,0) * R(.,pi/2) * R(.,0)` (XYX with `Rx`/`Ry` as `R`).
 };
 
 /**
@@ -42,6 +43,31 @@ enum class EulerBasis : std::uint8_t {
  */
 [[nodiscard]] std::optional<EulerBasis> parseEulerBasis(StringRef basis);
 
+/**
+ * @brief Euler angles `(theta, phi, lambda)` and global phase for a 2x2
+ * unitary.
+ *
+ * The decomposition obeys `matrix == e^{i*phase} * K(phi) * A(theta) *
+ * K(lambda)` where `(K, A)` are the rotation axes of the chosen @ref
+ * EulerBasis.
+ */
+struct EulerAngles {
+  double theta = 0.0;  ///< Middle rotation angle.
+  double phi = 0.0;    ///< First outer rotation angle.
+  double lambda = 0.0; ///< Second outer rotation angle.
+  double phase = 0.0;  ///< Global phase in radians.
+};
+
+/**
+ * @brief Extracts `(theta, phi, lambda, phase)` of @p matrix in @p basis.
+ *
+ * @param matrix The single-qubit unitary to decompose.
+ * @param basis The target Euler basis.
+ * @return The extracted Euler angles and global phase.
+ */
+[[nodiscard]] EulerAngles anglesFromUnitary(const Matrix2x2& matrix,
+                                            EulerBasis basis);
+
 /**
  * @brief Synthesizes a composed single-qubit unitary as gates in @p basis.
  *
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h
new file mode 100644
index 0000000000..f9abed48d7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LLVM.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cstdint>
+#include <optional>
+
+namespace mlir::qco::decomposition {
+
+/**
+ * @brief Native gate kinds that may appear in a two-qubit synthesis menu.
+ */
+enum class NativeGateKind : std::uint8_t {
+  U,
+  X,
+  Sx,
+  Rz,
+  Rx,
+  Ry,
+  R,
+  Cx,
+  Cz,
+  Rzz,
+};
+
+/**
+ * @brief Single-qubit emission strategy resolved from a native-gate menu.
+ */
+enum class SingleQubitMode : std::uint8_t {
+  ZSXX,     ///< `RZ` / `SX` / `X` via ZYZ decomposition.
+  U3,       ///< Generic `U(theta, phi, lambda)`.
+  R,        ///< `R(theta, phi)` chain (`Rx`/`Ry` as `R`).
+  AxisPair, ///< Two fixed rotation axes (see @ref AxisPair).
+};
+
+/**
+ * @brief Rotation-axis pair for @ref SingleQubitMode::AxisPair emitters.
+ */
+enum class AxisPair : std::uint8_t {
+  RxRz,
+  RxRy,
+  RyRz,
+};
+
+/**
+ * @brief Entangling basis gate for two-qubit Weyl synthesis.
+ */
+enum class EntanglerBasis : std::uint8_t {
+  None,
+  Cx,
+  Cz,
+};
+
+struct SingleQubitEmitterSpec {
+  SingleQubitMode mode = SingleQubitMode::U3;
+  AxisPair axisPair = AxisPair::RxRz;
+  bool supportsDirectRx = false;
+};
+
+/**
+ * @brief Resolved native-gate menu for two-qubit Weyl synthesis.
+ */
+struct NativeProfileSpec {
+  bool allowRzz = false;
+  llvm::DenseSet<NativeGateKind> allowedGates;
+  llvm::SmallVector<SingleQubitEmitterSpec> singleQubitEmitters;
+  llvm::SmallVector<EntanglerBasis> entanglerBases;
+};
+
+/**
+ * @brief Euler basis used to emit single-qubit factors for @p emitter.
+ */
+[[nodiscard]] EulerBasis
+emitterEulerBasis(const SingleQubitEmitterSpec& emitter);
+
+/**
+ * @brief Parses a comma-separated native-gate menu (e.g. `"u,cx,rzz"`).
+ */
+[[nodiscard]] std::optional<NativeProfileSpec>
+parseNativeSpec(StringRef nativeGates);
+
+/**
+ * @brief Synthesizes a composed two-qubit unitary as gates in @p spec.
+ */
+[[nodiscard]] LogicalResult
+synthesizeUnitary2QWeyl(OpBuilder& builder, Location loc, Value qubit0,
+                        Value qubit1, const Matrix4x4& target,
+                        const NativeProfileSpec& spec, Value& outQubit0,
+                        Value& outQubit1);
+
+/**
+ * @brief Number of entangling basis gates required to synthesize @p target.
+ *
+ * @return Entangler count for @p spec, or `std::nullopt` if synthesis fails.
+ */
+[[nodiscard]] std::optional<std::uint8_t>
+twoQubitEntanglerCount(const Matrix4x4& target, const NativeProfileSpec& spec);
+
+/**
+ * @brief Maps a compile-time single-qubit op to its native-menu gate kind.
+ *
+ * Returns `std::nullopt` for non-primitive or unsupported ops.
+ */
+[[nodiscard]] std::optional<NativeGateKind>
+nativeGateKindFor(UnitaryOpInterface op);
+
+/**
+ * @brief Returns true when @p op is already on the resolved single-qubit menu.
+ *
+ * `BarrierOp` and `GPhaseOp` are always allowed.
+ */
+[[nodiscard]] bool allowsSingleQubitOp(UnitaryOpInterface op,
+                                       const NativeProfileSpec& spec);
+
+/**
+ * @brief Entangler basis for a 1-control, 1-target `CtrlOp` with `X`/`Z` body.
+ */
+[[nodiscard]] std::optional<EntanglerBasis>
+entanglerBasisForSingleTargetCtrl(CtrlOp ctrl);
+
+/** @brief Returns true when @p spec lists @p basis as an entangler. */
+[[nodiscard]] bool profileAllowsEntangler(const NativeProfileSpec& spec,
+                                          EntanglerBasis basis);
+
+/**
+ * @brief Returns true when a 1-control, 1-target `CtrlOp` is on @p spec.
+ */
+[[nodiscard]] bool allowsSingleTargetCtrl(CtrlOp ctrl,
+                                          const NativeProfileSpec& spec);
+
+/**
+ * @brief Returns true when a bare two-qubit op (currently `RZZ`) is on @p spec.
+ */
+[[nodiscard]] bool allowsBareTwoQubitOp(Operation* op,
+                                        const NativeProfileSpec& spec);
+
+/**
+ * @brief Returns true when @p op is a native two-qubit gate under @p spec.
+ */
+[[nodiscard]] bool allowsTwoQubitOp(Operation* op,
+                                    const NativeProfileSpec& spec);
+
+/**
+ * @brief Fills @p matrix with the 4x4 unitary for a two-qubit block op.
+ *
+ * Handles single-target `CtrlOp` shells and generic two-qubit unitaries.
+ * Returns false for barriers, global phase, and unsupported shapes.
+ */
+[[nodiscard]] bool assignTwoQubitOpMatrix(Operation* op, Matrix4x4& matrix);
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h
new file mode 100644
index 0000000000..a05b1ab2bb
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+
+#include <array>
+#include <complex>
+#include <cstdint>
+#include <optional>
+
+namespace mlir::qco::decomposition {
+
+/**
+ * @brief Weyl decomposition of a 2-qubit unitary matrix (4x4).
+ *
+ * The result consists of four 2x2 single-qubit matrices (`k1l`, `k2l`,
+ * `k1r`, `k2r`) and three parameters for a canonical gate (`a`, `b`, `c`).
+ * The canonical gate is `RXX(-2 * a) * RYY(-2 * b) * RZZ(-2 * c)`.
+ *
+ * @note Adapted from TwoQubitWeylDecomposition in the IBM Qiskit framework.
+ *       (C) Copyright IBM 2023
+ *
+ *       This code is licensed under the Apache License, Version 2.0. You may
+ *       obtain a copy of this license in the LICENSE.txt file in the root
+ *       directory of this source tree or at
+ *       https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ *       Any modifications or derivative works of this code must retain this
+ *       copyright notice, and modified files need to carry a notice
+ *       indicating that they have been altered from the originals.
+ */
+class TwoQubitWeylDecomposition {
+public:
+  static TwoQubitWeylDecomposition create(const Matrix4x4& unitaryMatrix,
+                                          std::optional<double> fidelity);
+
+  ~TwoQubitWeylDecomposition() = default;
+  TwoQubitWeylDecomposition() = default;
+  TwoQubitWeylDecomposition(const TwoQubitWeylDecomposition&) = default;
+  TwoQubitWeylDecomposition(TwoQubitWeylDecomposition&&) = default;
+  TwoQubitWeylDecomposition&
+  operator=(const TwoQubitWeylDecomposition&) = default;
+  TwoQubitWeylDecomposition& operator=(TwoQubitWeylDecomposition&&) = default;
+
+  [[nodiscard]] Matrix4x4 getCanonicalMatrix() const {
+    return getCanonicalMatrix(a_, b_, c_);
+  }
+
+  [[nodiscard]] double a() const { return a_; }
+  [[nodiscard]] double b() const { return b_; }
+  [[nodiscard]] double c() const { return c_; }
+  [[nodiscard]] double globalPhase() const { return globalPhase_; }
+
+  /**
+   * @brief Left single-qubit factor after the canonical gate.
+   *
+   * ```
+   * q1 - k2r - C -  k1r  -
+   *            A
+   * q0 - k2l - N - *k1l* -
+   * ```
+   */
+  [[nodiscard]] const Matrix2x2& k1l() const { return k1l_; }
+  /**
+   * @brief Left single-qubit factor before the canonical gate.
+   *
+   * ```
+   * q1 -  k2r  - C - k1r -
+   *              A
+   * q0 - *k2l* - N - k1l -
+   * ```
+   */
+  [[nodiscard]] const Matrix2x2& k2l() const { return k2l_; }
+  /**
+   * @brief Right single-qubit factor after the canonical gate.
+   *
+   * ```
+   * q1 - k2r - C - *k1r* -
+   *            A
+   * q0 - k2l - N -  k1l  -
+   * ```
+   */
+  [[nodiscard]] const Matrix2x2& k1r() const { return k1r_; }
+  /**
+   * @brief Right single-qubit factor before the canonical gate.
+   *
+   * ```
+   * q1 - *k2r* - C - k1r -
+   *              A
+   * q0 -  k2l  - N - k1l -
+   * ```
+   */
+  [[nodiscard]] const Matrix2x2& k2r() const { return k2r_; }
+
+  [[nodiscard]] static Matrix4x4 getCanonicalMatrix(double a, double b,
+                                                    double c);
+
+private:
+  bool applySpecialization();
+
+  double a_{};
+  double b_{};
+  double c_{};
+  double globalPhase_{};
+  Matrix2x2 k1l_;
+  Matrix2x2 k2l_;
+  Matrix2x2 k1r_;
+  Matrix2x2 k2r_;
+  std::uint8_t specializationKind_{0};
+  std::optional<double> requestedFidelity;
+};
+
+using TwoQubitLocalUnitaryList = llvm::SmallVector<Matrix2x2, 8>;
+
+/**
+ * @brief Result of a two-qubit basis decomposition as single-qubit factors and
+ *        entangler uses.
+ *
+ * Factors are stored in emission order. For `i` in `[0, numBasisUses)` the
+ * pair `(singleQubitFactors[2*i], singleQubitFactors[2*i + 1])` is applied to
+ * qubits `1` and `0` respectively, followed by one entangler. The final pair
+ * `(singleQubitFactors[2*numBasisUses], singleQubitFactors[2*numBasisUses+1])`
+ * is applied after the last entangler. The list therefore has length
+ * `2 * (numBasisUses + 1)`.
+ */
+struct TwoQubitNativeDecomposition {
+  std::uint8_t numBasisUses = 0;
+  TwoQubitLocalUnitaryList singleQubitFactors;
+  double globalPhase = 0.0;
+};
+
+/**
+ * @brief Decomposer initialized with a two-qubit basis gate for canonical-gate
+ *        (RXX+RYY+RZZ) synthesis.
+ *
+ * @note Adapted from TwoQubitBasisDecomposer in the IBM Qiskit framework.
+ *       (C) Copyright IBM 2023
+ *
+ *       This code is licensed under the Apache License, Version 2.0. You may
+ *       obtain a copy of this license in the LICENSE.txt file in the root
+ *       directory of this source tree or at
+ *       https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ *       Any modifications or derivative works of this code must retain this
+ *       copyright notice, and modified files need to carry a notice
+ *       indicating that they have been altered from the originals.
+ */
+class TwoQubitBasisDecomposer {
+public:
+  [[nodiscard]] static TwoQubitBasisDecomposer
+  create(const Matrix4x4& basisMatrix, double basisFidelity);
+
+  [[nodiscard]] std::optional<TwoQubitNativeDecomposition>
+  twoQubitDecompose(const TwoQubitWeylDecomposition& targetDecomposition,
+                    std::optional<std::uint8_t> numBasisGateUses) const;
+
+private:
+  struct SmbPrecomputed {
+    Matrix2x2 u0l;
+    Matrix2x2 u0r;
+    Matrix2x2 u1l;
+    Matrix2x2 u1ra;
+    Matrix2x2 u1rb;
+    Matrix2x2 u2la;
+    Matrix2x2 u2lb;
+    Matrix2x2 u2ra;
+    Matrix2x2 u2rb;
+    Matrix2x2 u3l;
+    Matrix2x2 u3r;
+    Matrix2x2 q0l;
+    Matrix2x2 q0r;
+    Matrix2x2 q1la;
+    Matrix2x2 q1lb;
+    Matrix2x2 q1ra;
+    Matrix2x2 q1rb;
+    Matrix2x2 q2l;
+    Matrix2x2 q2r;
+  };
+
+  [[nodiscard]] static TwoQubitLocalUnitaryList
+  decomp0(const TwoQubitWeylDecomposition& target);
+  [[nodiscard]] TwoQubitLocalUnitaryList
+  decomp1(const TwoQubitWeylDecomposition& target) const;
+  [[nodiscard]] TwoQubitLocalUnitaryList
+  decomp2Supercontrolled(const TwoQubitWeylDecomposition& target) const;
+  [[nodiscard]] TwoQubitLocalUnitaryList
+  decomp3Supercontrolled(const TwoQubitWeylDecomposition& target) const;
+  [[nodiscard]] std::array<std::complex<double>, 4>
+  traces(const TwoQubitWeylDecomposition& target) const;
+
+  double basisFidelity{};
+  TwoQubitWeylDecomposition basisDecomposer;
+  bool isSuperControlled{};
+  SmbPrecomputed smb{};
+};
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
index 7444438a88..1919b50b0c 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
@@ -14,6 +14,8 @@
 #include <mlir/Pass/Pass.h>
 #include <mlir/Pass/PassRegistry.h>
 
+#include <string>
+
 namespace mlir::qco {
 
 #define GEN_PASS_DECL
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
index fafb906a77..b1c3bf2962 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
@@ -66,6 +66,54 @@ def FuseSingleQubitUnitaryRuns
                         "Target Euler basis (zyz, zxz, xzx, xyx, u, zsxx).">];
 }
 
+def FuseTwoQubitUnitaryRuns
+    : Pass<"fuse-two-qubit-unitary-runs", "mlir::ModuleOp"> {
+  let dependentDialects = ["mlir::qco::QCODialect"];
+  let summary = "Lower QCO unitary gates to a user-specified native gate menu.";
+  let description = [{
+    Rewrites unitaries to match the comma-separated `native-gates` menu.
+    `qco.barrier` and `qco.gphase` are preserved; `qco.ctrl` must have a
+    single control and a single target.
+
+    The menu is a comma-separated list of gate tokens (order not significant)
+    from which the pass builds a profile: a single-qubit synthesis mode
+    (generic `qco.u` when `u` is present; IBM-style surface gates when all of
+    `x`, `sx`, and `rz`/`p` are present; IQM-style `qco.r` when `r` is present;
+    or a supported rotation pair chosen from `rx`, `ry`, `rz`) plus optional
+    two-qubit entanglers `cx`, `cz`, and optional `rzz`.
+
+    Recognised tokens: `u`, `x`, `sx`, `rz` (or `p`), `rx`, `ry`, `r`, `cx`,
+    `cz`, `rzz`. An empty or whitespace-only menu is a no-op, which is the
+    intended pipeline default when synthesis is not needed. An unrecognised
+    token causes the pass to fail.
+
+    Example menus (each line is one illustrative menu; pick either `cx` or
+    `cz` as the entangler, or list both if both are native):
+    - IBM basic (no fractional two-qubit): `x,sx,rz,cx` or `x,sx,rz,cz`
+    - IBM fractional: `x,sx,rz,rx,rzz,cx` or `x,sx,rz,rx,rzz,cz`
+    - Generic single-qubit U: `u,cx` or `u,cz`
+    - IQM default: `r,cz` (or `r,cx` if CX is the native entangler)
+    - Rotation pair + entangler: `rx,rz,cx`, `rx,ry,cz`, `ry,rz,cx`, etc.
+      Supported pairs are exactly `rx`+`rz`, `rx`+`ry`, and `ry`+`rz`.
+
+    Stages: fuse single-qubit runs; fuse two-qubit windows; up to four
+    synthesis sweeps over remaining off-menu ops; fuse 1q seams; up to four
+    further synthesis and fusion rounds until the full menu holds. The pass
+    fails if anything remains off-menu after those caps.
+
+    Lowering is deterministic: `cx` is preferred over `cz`, single-qubit
+    factors use the first emitter's Euler basis, and two-qubit window
+    replacement uses the minimal entangler count from the synthesizer.
+  }];
+  let options = [Option<
+      "nativeGates", "native-gates", "std::string", "\"\"",
+      "Comma-separated native gate menu. Empty or whitespace-only is "
+      "a no-op. Tokens: u, x, sx, rz (or p), rx, ry, r, cx, cz, rzz. "
+      "Examples: x,sx,rz,cx; x,sx,rz,rx,rzz,cz; u,cx; r,cz; rx,rz,cx.">];
+}
+
+//===----------------------------------------------------------------------===//
+
 def QuantumLoopUnroll
     : InterfacePass<"quantum-loop-unroll", "FunctionOpInterface"> {
   let dependentDialects = ["mlir::qco::QCODialect", "mlir::scf::SCFDialect"];
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h b/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
index 8f49d372d7..d1e1c68446 100644
--- a/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
+++ b/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
@@ -183,6 +183,12 @@ struct Matrix2x2 {
    */
   [[nodiscard]] Matrix2x2 adjoint() const;
 
+  /**
+   * @brief Returns the (non-conjugate) transpose of this matrix.
+   * @return Transposed matrix `A^T`.
+   */
+  [[nodiscard]] Matrix2x2 transpose() const;
+
   /**
    * @brief Returns the trace of this matrix.
    * @return Sum of diagonal entries.
@@ -195,6 +201,13 @@ struct Matrix2x2 {
    */
   [[nodiscard]] Complex determinant() const;
 
+  /**
+   * @brief Checks whether this matrix is approximately the identity.
+   * @param tol Maximum allowed complex modulus of each entry difference.
+   * @return True if every entry is within @p tol of the identity.
+   */
+  [[nodiscard]] bool isIdentity(double tol = MATRIX_TOLERANCE) const;
+
   /**
    * @brief Checks approximate equality using an absolute entry-wise tolerance.
    *
@@ -322,6 +335,12 @@ struct Matrix4x4 {
    */
   [[nodiscard]] Matrix4x4 adjoint() const;
 
+  /**
+   * @brief Returns the (non-conjugate) transpose of this matrix.
+   * @return Transposed matrix `A^T`.
+   */
+  [[nodiscard]] Matrix4x4 transpose() const;
+
   /**
    * @brief Returns the trace of this matrix.
    * @return Sum of diagonal entries.
@@ -334,6 +353,53 @@ struct Matrix4x4 {
    */
   [[nodiscard]] Complex determinant() const;
 
+  /**
+   * @brief Checks whether this matrix is approximately the identity.
+   * @param tol Maximum allowed complex modulus of each entry difference.
+   * @return True if every entry is within @p tol of the identity.
+   */
+  [[nodiscard]] bool isIdentity(double tol = MATRIX_TOLERANCE) const;
+
+  /**
+   * @brief Returns the four diagonal entries `(m00, m11, m22, m33)`.
+   * @return Array of diagonal entries.
+   */
+  [[nodiscard]] std::array<Complex, K_ROWS> diagonal() const;
+
+  /**
+   * @brief Builds a diagonal matrix from four diagonal entries.
+   * @param diagonalEntries Diagonal entries `(m00, m11, m22, m33)`.
+   * @return Diagonal matrix with the given entries.
+   */
+  [[nodiscard]] static Matrix4x4
+  fromDiagonal(const std::array<Complex, K_ROWS>& diagonalEntries);
+
+  /**
+   * @brief Returns the entries of column @p col, top to bottom.
+   * @param col Column index in `[0, K_COLS)`.
+   * @return Array of the four column entries.
+   */
+  [[nodiscard]] std::array<Complex, K_ROWS> column(std::size_t col) const;
+
+  /**
+   * @brief Overwrites column @p col with @p values.
+   * @param col Column index in `[0, K_COLS)`.
+   * @param values New column entries, top to bottom.
+   */
+  void setColumn(std::size_t col, const std::array<Complex, K_ROWS>& values);
+
+  /**
+   * @brief Returns the element-wise real parts in row-major order.
+   * @return Real parts of all entries.
+   */
+  [[nodiscard]] std::array<double, K_SIZE_AT_COMPILE_TIME> realPart() const;
+
+  /**
+   * @brief Returns the element-wise imaginary parts in row-major order.
+   * @return Imaginary parts of all entries.
+   */
+  [[nodiscard]] std::array<double, K_SIZE_AT_COMPILE_TIME> imagPart() const;
+
   /**
    * @brief Checks approximate equality using an absolute entry-wise tolerance.
    *
@@ -539,6 +605,34 @@ class DynamicMatrix {
   [[nodiscard]] bool isApprox(const DynamicMatrix& other,
                               double tol = MATRIX_TOLERANCE) const;
 
+  /**
+   * @brief Returns the trace of this matrix.
+   * @return Sum of diagonal entries.
+   */
+  [[nodiscard]] Complex trace() const;
+
+  /**
+   * @brief Matrix product `*this * rhs`.
+   * @param rhs Right-hand factor.
+   * @return Product of the two matrices.
+   */
+  [[nodiscard]] DynamicMatrix operator*(const DynamicMatrix& rhs) const;
+
+  /**
+   * @brief Element-wise scaling by a complex scalar.
+   * @param scalar Factor applied to every matrix entry.
+   * @return Scaled copy of this matrix.
+   */
+  [[nodiscard]] DynamicMatrix operator*(const Complex& scalar) const;
+
+  /**
+   * @brief Checks whether this matrix is approximately the identity.
+   * @param tol Maximum allowed complex modulus of each off-diagonal entry and
+   * each diagonal deviation from one.
+   * @return True when the matrix is close to the identity.
+   */
+  [[nodiscard]] bool isIdentity(double tol = MATRIX_TOLERANCE) const;
+
 private:
   struct Impl;
   std::unique_ptr<Impl> impl_;
@@ -558,4 +652,148 @@ inline constexpr bool
     std::disjunction_v<std::is_same<T, Matrix1x1>, std::is_same<T, Matrix2x2>,
                        std::is_same<T, Matrix4x4>,
                        std::is_same<T, DynamicMatrix>>;
+
+/**
+ * @brief Kronecker product `lhs (x) rhs` of two single-qubit matrices.
+ *
+ * Uses the computational-basis bit order where the first operand labels the
+ * high bit, matching `UnitaryOpInterface::getUnitaryMatrix4x4`.
+ *
+ * @param lhs Left factor (acts on the high bit / qubit 0).
+ * @param rhs Right factor (acts on the low bit / qubit 1).
+ * @return The `4x4` Kronecker product.
+ */
+[[nodiscard]] Matrix4x4 kron(const Matrix2x2& lhs, const Matrix2x2& rhs);
+
+/// Scalar-on-the-left multiply `scalar * matrix` (commutes with the member
+/// `matrix * scalar`). Provided so generic code can scale a matrix from
+/// either side.
+[[nodiscard]] Matrix2x2 operator*(const Complex& scalar,
+                                  const Matrix2x2& matrix);
+/// @copydoc operator*(const Complex&, const Matrix2x2&)
+[[nodiscard]] Matrix4x4 operator*(const Complex& scalar,
+                                  const Matrix4x4& matrix);
+/// @copydoc operator*(const Complex&, const Matrix2x2&)
+[[nodiscard]] DynamicMatrix operator*(const Complex& scalar,
+                                      const DynamicMatrix& matrix);
+
+/**
+ * @brief Eigenvalues and eigenvectors of a real symmetric `4x4` matrix.
+ *
+ * `eigenvalues` are sorted ascending and `eigenvectors` holds the
+ * corresponding orthonormal eigenvectors as columns (column `j` is the
+ * eigenvector for `eigenvalues[j]`), matching the convention of
+ * `Eigen::SelfAdjointEigenSolver`.
+ */
+struct SymmetricEigen4 {
+  std::array<double, 4> eigenvalues{};
+  Matrix4x4 eigenvectors{};
+};
+
+/**
+ * @brief Computes the eigendecomposition of a real symmetric `4x4` matrix.
+ *
+ * Implements the cyclic Jacobi eigenvalue algorithm, which is numerically
+ * robust for small symmetric matrices and yields orthonormal eigenvectors
+ * even for degenerate spectra.
+ *
+ * @param symmetric Row-major real symmetric `4x4` matrix.
+ * @return Ascending eigenvalues and matching eigenvectors (as columns).
+ */
+[[nodiscard]] SymmetricEigen4
+jacobiSymmetricEigen(const std::array<double, 16>& symmetric);
+
+/// `SWAP` on two qubits.
+[[nodiscard]] const Matrix4x4& twoQubitSwapMatrix();
+
+/**
+ * @brief Embed a single-qubit matrix into the two-qubit space on @p qubitIndex.
+ *
+ * @param qubitIndex `0` for the high bit, `1` for the low bit.
+ */
+[[nodiscard]] Matrix4x4 embedSingleQubitInTwoQubit(const Matrix2x2& matrix,
+                                                   std::size_t qubitIndex);
+
+/**
+ * @brief Reorder a two-qubit matrix to act on qubits `{0, 1}`.
+ *
+ * @param q0Index Wire index of operand 0; @p q1Index wire index of operand 1.
+ */
+[[nodiscard]] Matrix4x4 reorderTwoQubitMatrix(const Matrix4x4& matrix,
+                                              std::size_t q0Index,
+                                              std::size_t q1Index);
+
+/**
+ * @brief Embed a single-qubit matrix into an @p numQubits-qubit Hilbert space.
+ *
+ * Qubit @p qubitIndex uses the same MSB-first convention as
+ * @ref embedSingleQubitInTwoQubit.
+ */
+[[nodiscard]] DynamicMatrix embedSingleQubitInNqubit(const Matrix2x2& matrix,
+                                                     std::size_t numQubits,
+                                                     std::size_t qubitIndex);
+
+/**
+ * @brief Embed a two-qubit matrix into an @p numQubits-qubit Hilbert space.
+ *
+ * Operand 0 labels the high bit of the pair and acts on @p q0Index; operand 1
+ * labels the low bit and acts on @p q1Index.
+ */
+[[nodiscard]] DynamicMatrix embedTwoQubitInNqubit(const Matrix4x4& matrix,
+                                                  std::size_t numQubits,
+                                                  std::size_t q0Index,
+                                                  std::size_t q1Index);
+
+inline constexpr double FRAC1_SQRT2 =
+    0.707106781186547524400844362104849039284835937688474036588;
+
+/**
+ * @brief Non-negative remainder of @p a modulo @p b.
+ *
+ * Unlike `std::fmod`, the result is always in `[0, |b|)`.
+ */
+[[nodiscard]] double remEuclid(double a, double b);
+
+/**
+ * @brief Average two-qubit gate fidelity from a Hilbert-Schmidt trace.
+ *
+ * Maps `|tr(U^dag V)|` to fidelity via `(4 + |tr|^2) / 20`.
+ */
+[[nodiscard]] double traceToFidelity(const Complex& trace);
+
+/** @brief Unit-modulus global phase factor `exp(i * phase)`. */
+[[nodiscard]] Complex globalPhaseFactor(double phase);
+
+/** @brief Returns true when @p matrix is unitary within @p tolerance. */
+[[nodiscard]] bool isUnitaryMatrix(const Matrix2x2& matrix,
+                                   double tolerance = MATRIX_TOLERANCE);
+
+[[nodiscard]] Matrix2x2 rxMatrix(double theta);
+[[nodiscard]] Matrix2x2 ryMatrix(double theta);
+[[nodiscard]] Matrix2x2 rzMatrix(double theta);
+
+[[nodiscard]] const Matrix2x2& iPauliX();
+[[nodiscard]] const Matrix2x2& iPauliY();
+[[nodiscard]] const Matrix2x2& iPauliZ();
+
+[[nodiscard]] Matrix4x4 rxxMatrix(double theta);
+[[nodiscard]] Matrix4x4 ryyMatrix(double theta);
+[[nodiscard]] Matrix4x4 rzzMatrix(double theta);
+
+/**
+ * @brief Controlled-`X` with control on qubit 0 (MSB) and target on qubit 1.
+ *
+ * Operand order matches @ref embedSingleQubitInTwoQubit and QCO unitaries.
+ */
+[[nodiscard]] const Matrix4x4& twoQubitControlledX01();
+
+/**
+ * @brief Controlled-`X` with control on qubit 1 and target on qubit 0.
+ *
+ * Same entangling content as @ref twoQubitControlledX01 but with wires
+ * reversed; useful when parametrizing basis-decomposer tests.
+ */
+[[nodiscard]] const Matrix4x4& twoQubitControlledX10();
+[[nodiscard]] const Matrix4x4& twoQubitControlledZ();
+
 } // namespace mlir::qco
diff --git a/mlir/lib/Compiler/CompilerPipeline.cpp b/mlir/lib/Compiler/CompilerPipeline.cpp
index 821ccda2ee..ca1aae19f5 100644
--- a/mlir/lib/Compiler/CompilerPipeline.cpp
+++ b/mlir/lib/Compiler/CompilerPipeline.cpp
@@ -90,7 +90,7 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
   // 2. QC cleanup
   // 3. QC-to-QCO conversion
   // 4. QCO cleanup
-  // 5. Optimization passes
+  // 5. Optimization and Native Gate Synthesis
   // 6. QCO cleanup
   // 7. QCO-to-QC conversion
   // 8. QC cleanup
@@ -145,7 +145,7 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
                        totalStages);
     }
   }
-  // Stage 5: Optimization passes
+  // Stage 5: Optimization and native gate synthesis
   if (failed(runStage([&](PassManager& pm) {
         if (!config_.disableMergeSingleQubitRotationGates) {
           pm.addPass(qco::createMergeSingleQubitRotationGates());
@@ -153,14 +153,18 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
         if (config_.enableHadamardLifting) {
           pm.addPass(qco::createHadamardLifting());
         }
+        pm.addPass(qco::createFuseTwoQubitUnitaryRuns(
+            qco::FuseTwoQubitUnitaryRunsOptions{
+                .nativeGates = config_.nativeGates,
+            }));
       }))) {
     return failure();
   }
   if (record != nullptr && config_.recordIntermediates) {
     record->afterOptimization = captureIR(module);
     if (config_.printIRAfterAllStages) {
-      prettyPrintStage(module, "Optimization Passes", ++currentStage,
-                       totalStages);
+      prettyPrintStage(module, "Optimization and Native Gate Synthesis",
+                       ++currentStage, totalStages);
     }
   }
   // Stage 6: QCO cleanup
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
index 1b742170fa..1baee612d9 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
@@ -115,9 +115,14 @@ std::optional<Matrix2x2> ROp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto thetaSin = std::sin(*theta / 2);
-  const auto m01 = std::polar(thetaSin, -*phi - (std::numbers::pi / 2));
-  const auto m10 = std::polar(thetaSin, *phi - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `sin * e^{i*phi}` instead, which is well-defined for any
+  // sign of the magnitude.
+  const auto m01 = thetaSin * std::exp(1i * (-*phi - (std::numbers::pi / 2)));
+  const auto m10 = thetaSin * std::exp(1i * (*phi - (std::numbers::pi / 2)));
   const auto thetaCos = std::cos(*theta / 2);
   return Matrix2x2::fromElements(thetaCos, m01,  // row 0
                                  m10, thetaCos); // row 1
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
index f504ba3a49..4cbe633883 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
@@ -133,12 +133,16 @@ std::optional<Matrix2x2> UOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto c = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
 
-  const auto m01 = std::polar(s, *lambda + std::numbers::pi);
-  const auto m10 = std::polar(s, *phi);
-  const auto m11 = std::polar(c, *phi + *lambda);
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `sin`/`cos` of `theta / 2` can be negative. Build the phased
+  // entries via `mag * e^{i*phi}` instead, which is well-defined for any sign.
+  const auto m01 = s * std::exp(1i * (*lambda + std::numbers::pi));
+  const auto m10 = s * std::exp(1i * (*phi));
+  const auto m11 = c * std::exp(1i * (*phi + *lambda));
   return Matrix2x2::fromElements(c, m01,    // row 0
                                  m10, m11); // row 1
 }
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
index c75c45f26e..24e5fbf8c5 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
@@ -79,10 +79,15 @@ std::optional<Matrix4x4> XXMinusYYOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto mc = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
-  const auto msp = std::polar(s, *beta - (std::numbers::pi / 2));
-  const auto msm = std::polar(s, -*beta - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `s = sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `s * e^{i*phi}` instead, which is well-defined for any
+  // sign of `s`.
+  const auto msp = s * std::exp(1i * (*beta - (std::numbers::pi / 2)));
+  const auto msm = s * std::exp(1i * (-*beta - (std::numbers::pi / 2)));
   return Matrix4x4::fromElements(mc, 0, 0, msm,  // row 0
                                  0, 1, 0, 0,     // row 1
                                  0, 0, 1, 0,     // row 2
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
index 6511b2344b..4cfb663658 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
@@ -79,10 +79,15 @@ std::optional<Matrix4x4> XXPlusYYOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto mc = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
-  const auto msp = std::polar(s, *beta - (std::numbers::pi / 2));
-  const auto msm = std::polar(s, -*beta - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `s = sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `s * e^{i*phi}` instead, which is well-defined for any
+  // sign of `s`.
+  const auto msp = s * std::exp(1i * (*beta - (std::numbers::pi / 2)));
+  const auto msm = s * std::exp(1i * (-*beta - (std::numbers::pi / 2)));
   return Matrix4x4::fromElements(1, 0, 0, 0,    // row 0
                                  0, mc, msp, 0, // row 1
                                  0, msm, mc, 0, // row 2
diff --git a/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt b/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
index 3564b55dc5..d56388d402 100644
--- a/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
@@ -22,11 +22,11 @@ add_mlir_library(
   DEPENDS
   MLIRQCOTransformsIncGen)
 
-# collect header files
-file(GLOB_RECURSE PASSES_HEADERS_SOURCE
-     ${MQT_MLIR_SOURCE_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.h)
-file(GLOB_RECURSE PASSES_HEADERS_BUILD
-     ${MQT_MLIR_BUILD_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.inc)
+# collect header files (subdirs: Decomposition/, …)
+file(GLOB_RECURSE PASSES_HEADERS_SOURCE CONFIGURE_DEPENDS
+     "${MQT_MLIR_SOURCE_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.h")
+file(GLOB_RECURSE PASSES_HEADERS_BUILD CONFIGURE_DEPENDS
+     "${MQT_MLIR_BUILD_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.inc")
 
 # add public headers using file sets
 target_sources(
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
index 085d493abf..5f017109c5 100644
--- a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
@@ -102,21 +102,6 @@ static void emitGPhaseIfNeeded(OpBuilder& builder, Location loc, double phase) {
 // Euler decomposition (angles)
 //===----------------------------------------------------------------------===//
 
-/**
- * @brief Euler angles `(theta, phi, lambda)` and global phase for a 2x2
- * unitary.
- */
-namespace {
-
-struct EulerAngles {
-  double theta = 0.0;  ///< Middle rotation angle.
-  double phi = 0.0;    ///< First outer rotation angle.
-  double lambda = 0.0; ///< Second outer rotation angle.
-  double phase = 0.0;  ///< Global phase in radians.
-};
-
-} // namespace
-
 /**
  * @brief Z-Y-Z Euler angles and global phase for a 2x2 unitary.
  *
@@ -197,15 +182,7 @@ struct EulerAngles {
           .phase = phase - (0.5 * (phi + lambda))};
 }
 
-/**
- * @brief Extracts `(theta, phi, lambda, phase)` for all Euler bases.
- *
- * @param matrix The single-qubit unitary to decompose.
- * @param basis The target Euler basis.
- * @return The extracted Euler angles and global phase.
- */
-[[nodiscard]] static EulerAngles anglesFromUnitary(const Matrix2x2& matrix,
-                                                   const EulerBasis basis) {
+EulerAngles anglesFromUnitary(const Matrix2x2& matrix, const EulerBasis basis) {
   switch (basis) {
   case EulerBasis::ZYZ:
   case EulerBasis::ZSXX:
@@ -215,6 +192,9 @@ struct EulerAngles {
   case EulerBasis::XZX:
     return paramsXZX(matrix);
   case EulerBasis::XYX:
+  case EulerBasis::R:
+    // The `R` basis reuses the X-Y-X angles and lowers `Rx`/`Ry` to the native
+    // `R(theta, phi)` gate (`Rx(a) == R(a, 0)`, `Ry(a) == R(a, pi/2)`).
     return paramsXYX(matrix);
   case EulerBasis::U:
     return paramsU(matrix);
@@ -235,7 +215,7 @@ namespace {
  * `RZ`/`RY`/`RX` use @p theta as the rotation angle; `U` uses all three angles.
  */
 struct SynthesisStep {
-  enum class Kind : std::uint8_t { RZ, RY, RX, SX, X, U };
+  enum class Kind : std::uint8_t { RZ, RY, RX, SX, X, U, R };
 
   Kind kind = Kind::RZ;
   double theta = 0.0;
@@ -264,6 +244,19 @@ struct Unitary1QEulerPlan {
     }
   }
 
+  /**
+   * @brief Appends a native `R(angle, axis)` step for non-negligible angles.
+   *
+   * @param angle The rotation angle in radians.
+   * @param axis The rotation axis in the XY-plane (`0` for `Rx`, `pi/2` for
+   *             `Ry`).
+   */
+  void appendRStep(const double angle, const double axis) {
+    if (!isNearZeroRotationAngle(angle)) {
+      steps.emplace_back(SynthesisStep::Kind::R, angle, axis);
+    }
+  }
+
   /**
    * @brief Appends the decomposition for @p basis based on @p angles.
    *
@@ -290,6 +283,9 @@ struct Unitary1QEulerPlan {
       case EulerBasis::XYX:
         appendRotation(SynthesisStep::Kind::RX, angles.phi + angles.lambda);
         break;
+      case EulerBasis::R:
+        appendRStep(angles.phi + angles.lambda, 0.0);
+        break;
       case EulerBasis::U:
         steps.emplace_back(SynthesisStep::Kind::U, 0.0, angles.phi,
                            angles.lambda);
@@ -324,6 +320,14 @@ struct Unitary1QEulerPlan {
       appendRotation(SynthesisStep::Kind::RX, angles.phi);
       phase = angles.phase;
       break;
+    case EulerBasis::R:
+      // X-Y-X with `Rx(a) == R(a, 0)` and `Ry(a) == R(a, pi/2)`.
+      appendRStep(angles.lambda, 0.0);
+      steps.emplace_back(SynthesisStep::Kind::R, angles.theta,
+                         std::numbers::pi / 2.0);
+      appendRStep(angles.phi, 0.0);
+      phase = angles.phase;
+      break;
     case EulerBasis::U:
       steps.emplace_back(SynthesisStep::Kind::U, angles.theta, angles.phi,
                          angles.lambda);
@@ -413,6 +417,9 @@ emitUnitary1QEulerPlan(OpBuilder& builder, Location loc, Value qubit,
       qubit =
           UOp::create(builder, loc, qubit, theta, phi, lambda).getQubitOut();
       break;
+    case SynthesisStep::Kind::R:
+      qubit = ROp::create(builder, loc, qubit, theta, phi).getQubitOut();
+      break;
     }
   }
   emitGPhaseIfNeeded(builder, loc, plan.phase);
@@ -427,6 +434,7 @@ std::optional<EulerBasis> parseEulerBasis(StringRef basis) {
       .Case("xyx", EulerBasis::XYX)
       .Case("u", EulerBasis::U)
       .Case("zsxx", EulerBasis::ZSXX)
+      .Case("r", EulerBasis::R)
       .Default(std::nullopt);
 }
 
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/NativeProfile.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/NativeProfile.cpp
new file mode 100644
index 0000000000..d954355bdc
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/NativeProfile.cpp
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h"
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LLVM.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+using mlir::qco::Matrix2x2;
+using mlir::qco::Matrix4x4;
+
+namespace mlir::qco::decomposition {
+
+static std::optional<NativeGateKind> parseGateToken(llvm::StringRef name) {
+  return llvm::StringSwitch<std::optional<NativeGateKind>>(name)
+      .Case("u", NativeGateKind::U)
+      .Case("x", NativeGateKind::X)
+      .Case("sx", NativeGateKind::Sx)
+      .Cases("rz", "p", NativeGateKind::Rz)
+      .Case("rx", NativeGateKind::Rx)
+      .Case("ry", NativeGateKind::Ry)
+      .Case("r", NativeGateKind::R)
+      .Case("cx", NativeGateKind::Cx)
+      .Case("cz", NativeGateKind::Cz)
+      .Case("rzz", NativeGateKind::Rzz)
+      .Default(std::nullopt);
+}
+
+static std::optional<llvm::DenseSet<NativeGateKind>>
+parseGateSet(llvm::StringRef nativeGates) {
+  llvm::DenseSet<NativeGateKind> gates;
+  llvm::SmallVector<llvm::StringRef> parts;
+  nativeGates.split(parts, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  for (llvm::StringRef part : parts) {
+    const auto token = part.trim().lower();
+    if (token.empty()) {
+      continue;
+    }
+    const auto gate = parseGateToken(token);
+    if (!gate) {
+      return std::nullopt;
+    }
+    gates.insert(*gate);
+  }
+  return gates;
+}
+
+static SingleQubitEmitterSpec
+makeEmitterSpec(SingleQubitMode mode, AxisPair axisPair = AxisPair::RxRz,
+                bool supportsDirectRx = false) {
+  return {
+      .mode = mode, .axisPair = axisPair, .supportsDirectRx = supportsDirectRx};
+}
+
+static void
+addEmitterIfAbsent(llvm::SmallVectorImpl<SingleQubitEmitterSpec>& emitters,
+                   SingleQubitMode mode, AxisPair axisPair = AxisPair::RxRz,
+                   bool supportsDirectRx = false) {
+  const bool present = llvm::any_of(emitters, [&](const auto& e) {
+    return e.mode == mode && e.axisPair == axisPair &&
+           e.supportsDirectRx == supportsDirectRx;
+  });
+  if (!present) {
+    emitters.push_back(makeEmitterSpec(mode, axisPair, supportsDirectRx));
+  }
+}
+
+static llvm::SmallVector<NativeGateKind, 4>
+allowedGatesForEmitter(const SingleQubitEmitterSpec& emitter) {
+  switch (emitter.mode) {
+  case SingleQubitMode::ZSXX: {
+    llvm::SmallVector<NativeGateKind, 4> gates{
+        NativeGateKind::X, NativeGateKind::Sx, NativeGateKind::Rz};
+    if (emitter.supportsDirectRx) {
+      gates.push_back(NativeGateKind::Rx);
+    }
+    return gates;
+  }
+  case SingleQubitMode::U3:
+    return {NativeGateKind::U};
+  case SingleQubitMode::R:
+    return {NativeGateKind::R};
+  case SingleQubitMode::AxisPair:
+    switch (emitter.axisPair) {
+    case AxisPair::RxRz:
+      return {NativeGateKind::Rx, NativeGateKind::Rz};
+    case AxisPair::RxRy:
+      return {NativeGateKind::Rx, NativeGateKind::Ry};
+    case AxisPair::RyRz:
+      return {NativeGateKind::Ry, NativeGateKind::Rz};
+    }
+    break;
+  }
+  llvm_unreachable("unknown single-qubit mode");
+}
+
+static llvm::SmallVector<NativeGateKind, 2>
+allowedGatesForEntangler(EntanglerBasis entangler) {
+  switch (entangler) {
+  case EntanglerBasis::None:
+    return {};
+  case EntanglerBasis::Cx:
+    return {NativeGateKind::Cx};
+  case EntanglerBasis::Cz:
+    return {NativeGateKind::Cz};
+  }
+  llvm_unreachable("unknown entangler basis");
+}
+
+static void populateAllowedGates(NativeProfileSpec& spec) {
+  spec.allowedGates.clear();
+  for (const auto& emitter : spec.singleQubitEmitters) {
+    const auto allowed = allowedGatesForEmitter(emitter);
+    spec.allowedGates.insert(allowed.begin(), allowed.end());
+  }
+  for (const auto entangler : spec.entanglerBases) {
+    const auto allowed = allowedGatesForEntangler(entangler);
+    spec.allowedGates.insert(allowed.begin(), allowed.end());
+  }
+  if (spec.allowRzz) {
+    spec.allowedGates.insert(NativeGateKind::Rzz);
+  }
+}
+
+static std::optional<EntanglerBasis>
+selectEntangler(const NativeProfileSpec& spec) {
+  if (llvm::is_contained(spec.entanglerBases, EntanglerBasis::Cx)) {
+    return EntanglerBasis::Cx;
+  }
+  if (llvm::is_contained(spec.entanglerBases, EntanglerBasis::Cz)) {
+    return EntanglerBasis::Cz;
+  }
+  return std::nullopt;
+}
+
+static Matrix4x4 entanglerMatrix(EntanglerBasis entangler) {
+  return entangler == EntanglerBasis::Cz ? mlir::qco::twoQubitControlledZ()
+                                         : mlir::qco::twoQubitControlledX01();
+}
+
+static std::optional<TwoQubitNativeDecomposition>
+decomposeWithEntangler(const Matrix4x4& target, EntanglerBasis entangler) {
+  auto decomposer =
+      TwoQubitBasisDecomposer::create(entanglerMatrix(entangler), 1.0);
+  auto weyl = TwoQubitWeylDecomposition::create(target, std::nullopt);
+  return decomposer.twoQubitDecompose(weyl, std::nullopt);
+}
+
+static void emitGPhaseIfNonTrivial(OpBuilder& builder, Location loc,
+                                   double phase) {
+  constexpr double epsilon = 1e-12;
+  if (std::abs(phase) > epsilon) {
+    GPhaseOp::create(builder, loc, phase);
+  }
+}
+
+static Value emitSingleQubitMatrix(OpBuilder& builder, Location loc,
+                                   Value inQubit, const Matrix2x2& matrix,
+                                   EulerBasis basis) {
+  return *synthesizeUnitary1QEuler(builder, loc, inQubit, matrix,
+                                   /*runSize=*/0, /*hasNonBasisGate=*/true,
+                                   basis);
+}
+
+EulerBasis emitterEulerBasis(const SingleQubitEmitterSpec& emitter) {
+  switch (emitter.mode) {
+  case SingleQubitMode::ZSXX:
+    return EulerBasis::ZSXX;
+  case SingleQubitMode::U3:
+    return EulerBasis::U;
+  case SingleQubitMode::R:
+    return EulerBasis::R;
+  case SingleQubitMode::AxisPair:
+    switch (emitter.axisPair) {
+    case AxisPair::RxRz:
+      return EulerBasis::XZX;
+    case AxisPair::RxRy:
+      return EulerBasis::XYX;
+    case AxisPair::RyRz:
+      return EulerBasis::ZYZ;
+    }
+    break;
+  }
+  llvm_unreachable("unknown single-qubit mode");
+}
+
+std::optional<NativeProfileSpec> parseNativeSpec(llvm::StringRef nativeGates) {
+  const auto gates = parseGateSet(nativeGates);
+  if (!gates || gates->empty()) {
+    return std::nullopt;
+  }
+  const auto has = [&](NativeGateKind kind) { return gates->contains(kind); };
+
+  NativeProfileSpec spec;
+
+  if (has(NativeGateKind::U)) {
+    addEmitterIfAbsent(spec.singleQubitEmitters, SingleQubitMode::U3);
+  }
+  const bool hasXSxRz = has(NativeGateKind::X) && has(NativeGateKind::Sx) &&
+                        has(NativeGateKind::Rz);
+  if (hasXSxRz) {
+    addEmitterIfAbsent(spec.singleQubitEmitters, SingleQubitMode::ZSXX,
+                       AxisPair::RxRz,
+                       /*supportsDirectRx=*/has(NativeGateKind::Rx));
+  }
+  if (has(NativeGateKind::R)) {
+    addEmitterIfAbsent(spec.singleQubitEmitters, SingleQubitMode::R);
+  }
+  struct AxisPairRule {
+    AxisPair axis;
+    NativeGateKind left;
+    NativeGateKind right;
+  };
+  for (const auto& rule : {
+           AxisPairRule{.axis = AxisPair::RxRz,
+                        .left = NativeGateKind::Rx,
+                        .right = NativeGateKind::Rz},
+           AxisPairRule{.axis = AxisPair::RxRy,
+                        .left = NativeGateKind::Rx,
+                        .right = NativeGateKind::Ry},
+           AxisPairRule{.axis = AxisPair::RyRz,
+                        .left = NativeGateKind::Ry,
+                        .right = NativeGateKind::Rz},
+       }) {
+    if (has(rule.left) && has(rule.right)) {
+      addEmitterIfAbsent(spec.singleQubitEmitters, SingleQubitMode::AxisPair,
+                         rule.axis);
+    }
+  }
+  if (spec.singleQubitEmitters.empty()) {
+    return std::nullopt;
+  }
+
+  if (has(NativeGateKind::Cx)) {
+    spec.entanglerBases.push_back(EntanglerBasis::Cx);
+  }
+  if (has(NativeGateKind::Cz)) {
+    spec.entanglerBases.push_back(EntanglerBasis::Cz);
+  }
+  spec.allowRzz = has(NativeGateKind::Rzz);
+
+  populateAllowedGates(spec);
+  return spec;
+}
+
+LogicalResult synthesizeUnitary2QWeyl(OpBuilder& builder, Location loc,
+                                      Value qubit0, Value qubit1,
+                                      const Matrix4x4& target,
+                                      const NativeProfileSpec& spec,
+                                      Value& outQubit0, Value& outQubit1) {
+  const auto entangler = selectEntangler(spec);
+  if (!entangler) {
+    return failure();
+  }
+  const auto native = decomposeWithEntangler(target, *entangler);
+  if (!native) {
+    return failure();
+  }
+  const auto basis = emitterEulerBasis(spec.singleQubitEmitters.front());
+
+  emitGPhaseIfNonTrivial(builder, loc, native->globalPhase);
+
+  Value wire0 = qubit0;
+  Value wire1 = qubit1;
+  const auto& factors = native->singleQubitFactors;
+  const std::uint8_t numBasisUses = native->numBasisUses;
+  const auto emitFactor = [&](Value& wire, std::size_t index) {
+    wire = emitSingleQubitMatrix(builder, loc, wire, factors[index], basis);
+  };
+  const auto emitEntangler = [&]() {
+    auto ctrlOp = CtrlOp::create(
+        builder, loc, ValueRange{wire0}, ValueRange{wire1},
+        [&](ValueRange targetArgs) -> llvm::SmallVector<Value> {
+          if (*entangler == EntanglerBasis::Cz) {
+            return {ZOp::create(builder, loc, targetArgs[0]).getOutputQubit(0)};
+          }
+          return {XOp::create(builder, loc, targetArgs[0]).getOutputQubit(0)};
+        });
+    wire0 = ctrlOp.getOutputControl(0);
+    wire1 = ctrlOp.getOutputTarget(0);
+  };
+
+  for (std::uint8_t i = 0; i < numBasisUses; ++i) {
+    emitFactor(wire1, static_cast<std::size_t>(2 * i));
+    emitFactor(wire0, static_cast<std::size_t>((2 * i) + 1));
+    emitEntangler();
+  }
+  emitFactor(wire1, static_cast<std::size_t>(2 * numBasisUses));
+  emitFactor(wire0, static_cast<std::size_t>((2 * numBasisUses) + 1));
+
+  outQubit0 = wire0;
+  outQubit1 = wire1;
+  return success();
+}
+
+std::optional<std::uint8_t>
+twoQubitEntanglerCount(const Matrix4x4& target, const NativeProfileSpec& spec) {
+  const auto entangler = selectEntangler(spec);
+  if (!entangler) {
+    return std::nullopt;
+  }
+  const auto native = decomposeWithEntangler(target, *entangler);
+  if (!native) {
+    return std::nullopt;
+  }
+  return native->numBasisUses;
+}
+
+std::optional<NativeGateKind> nativeGateKindFor(UnitaryOpInterface op) {
+  Operation* raw = op.getOperation();
+  if (llvm::isa<UOp>(raw)) {
+    return NativeGateKind::U;
+  }
+  if (llvm::isa<XOp>(raw)) {
+    return NativeGateKind::X;
+  }
+  if (llvm::isa<SXOp>(raw)) {
+    return NativeGateKind::Sx;
+  }
+  if (llvm::isa<RZOp, POp>(raw)) {
+    return NativeGateKind::Rz;
+  }
+  if (llvm::isa<RXOp>(raw)) {
+    return NativeGateKind::Rx;
+  }
+  if (llvm::isa<RYOp>(raw)) {
+    return NativeGateKind::Ry;
+  }
+  if (llvm::isa<ROp>(raw)) {
+    return NativeGateKind::R;
+  }
+  return std::nullopt;
+}
+
+bool allowsSingleQubitOp(UnitaryOpInterface op, const NativeProfileSpec& spec) {
+  if (llvm::isa<BarrierOp, GPhaseOp>(op.getOperation())) {
+    return true;
+  }
+  const auto gate = nativeGateKindFor(op);
+  return gate && spec.allowedGates.contains(*gate);
+}
+
+std::optional<EntanglerBasis> entanglerBasisForSingleTargetCtrl(CtrlOp ctrl) {
+  if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+    return std::nullopt;
+  }
+  Operation* body = ctrl.getBodyUnitary(0).getOperation();
+  if (llvm::isa<XOp>(body)) {
+    return EntanglerBasis::Cx;
+  }
+  if (llvm::isa<ZOp>(body)) {
+    return EntanglerBasis::Cz;
+  }
+  return std::nullopt;
+}
+
+bool profileAllowsEntangler(const NativeProfileSpec& spec,
+                            EntanglerBasis basis) {
+  return llvm::is_contained(spec.entanglerBases, basis);
+}
+
+bool allowsSingleTargetCtrl(CtrlOp ctrl, const NativeProfileSpec& spec) {
+  const auto basis = entanglerBasisForSingleTargetCtrl(ctrl);
+  return basis && profileAllowsEntangler(spec, *basis);
+}
+
+bool allowsBareTwoQubitOp(Operation* op, const NativeProfileSpec& spec) {
+  return spec.allowRzz && llvm::isa<RZZOp>(op) &&
+         spec.allowedGates.contains(NativeGateKind::Rzz);
+}
+
+bool allowsTwoQubitOp(Operation* op, const NativeProfileSpec& spec) {
+  if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+    return allowsSingleTargetCtrl(ctrl, spec);
+  }
+  return allowsBareTwoQubitOp(op, spec);
+}
+
+bool assignTwoQubitOpMatrix(Operation* op, Matrix4x4& matrix) {
+  if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+    return false;
+  }
+  if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+    const auto basis = entanglerBasisForSingleTargetCtrl(ctrl);
+    if (!basis) {
+      return false;
+    }
+    matrix = *basis == EntanglerBasis::Cz ? mlir::qco::twoQubitControlledZ()
+                                          : mlir::qco::twoQubitControlledX01();
+    return true;
+  }
+  auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+  if (!unitary || !unitary.isTwoQubit()) {
+    return false;
+  }
+  return unitary.getUnitaryMatrix4x4(matrix);
+}
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Weyl.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Weyl.cpp
new file mode 100644
index 0000000000..3c23f466e3
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Weyl.cpp
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h"
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/Support/ErrorHandling.h>
+#include <llvm/Support/FormatVariadic.h>
+#include <mlir/Support/LLVM.h>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <numbers>
+#include <optional>
+#include <random>
+#include <tuple>
+#include <utility>
+
+using mlir::qco::Complex;
+using mlir::qco::Matrix2x2;
+using mlir::qco::Matrix4x4;
+
+static constexpr double SANITY_CHECK_PRECISION = 1e-12;
+
+namespace mlir::qco::decomposition {
+
+using namespace std::complex_literals;
+
+namespace {
+
+enum class Specialization : std::uint8_t {
+  General,
+  IdEquiv,
+  SWAPEquiv,
+  PartialSWAPEquiv,
+  PartialSWAPFlipEquiv,
+  ControlledEquiv,
+  MirrorControlledEquiv,
+  FSimaabEquiv,
+  FSimabbEquiv,
+  FSimabmbEquiv,
+};
+
+enum class MagicBasisTransform : std::uint8_t {
+  Into,
+  OutOf,
+};
+
+} // namespace
+
+static constexpr auto DIAGONALIZATION_PRECISION = 1e-13;
+
+static Matrix4x4 magicBasisTransform(const Matrix4x4& unitary,
+                                     MagicBasisTransform direction) {
+  const Matrix4x4 bNonNormalized = Matrix4x4::fromElements( //
+      1, 1i, 0, 0,                                          //
+      0, 0, 1i, 1,                                          //
+      0, 0, 1i, -1,                                         //
+      1, -1i, 0, 0);
+  const Matrix4x4 bNonNormalizedDagger = Matrix4x4::fromElements( //
+      0.5, 0, 0, 0.5,                                             //
+      Complex{0.0, -0.5}, 0, 0, Complex{0.0, 0.5},                //
+      0, Complex{0.0, -0.5}, Complex{0.0, -0.5}, 0,               //
+      0, 0.5, -0.5, 0);
+  if (direction == MagicBasisTransform::OutOf) {
+    return bNonNormalizedDagger * unitary * bNonNormalized;
+  }
+  if (direction == MagicBasisTransform::Into) {
+    return bNonNormalized * unitary * bNonNormalizedDagger;
+  }
+  llvm::reportFatalInternalError("Unknown MagicBasisTransform direction!");
+}
+
+static double closestPartialSwap(double a, double b, double c) {
+  auto m = (a + b + c) / 3.;
+  auto [am, bm, cm] = std::array{a - m, b - m, c - m};
+  auto [ab, bc, ca] = std::array{a - b, b - c, c - a};
+  return m + (am * bm * cm * (6. + (ab * ab) + (bc * bc) + (ca * ca)) / 18.);
+}
+
+static std::pair<Matrix4x4, std::array<Complex, 4>>
+diagonalizeComplexSymmetric(const Matrix4x4& m, double precision) {
+  auto state = std::mt19937{2023};
+  std::normal_distribution<double> dist;
+
+  const auto mReal = m.realPart();
+  const auto mImag = m.imagPart();
+
+  double bestErr = 1e300;
+  constexpr auto maxDiagonalizationAttempts = 100;
+  for (int i = 0; i < maxDiagonalizationAttempts; ++i) {
+    double randA{};
+    double randB{};
+    if (i == 0) {
+      randA = 1.2602066112249388;
+      randB = 0.22317849046722027;
+    } else {
+      randA = dist(state);
+      randB = dist(state);
+    }
+    std::array<double, 16> m2Real{};
+    for (std::size_t k = 0; k < m2Real.size(); ++k) {
+      m2Real[k] = (randA * mReal[k]) + (randB * mImag[k]);
+    }
+    const Matrix4x4 p = jacobiSymmetricEigen(m2Real).eigenvectors;
+    const std::array<Complex, 4> d = (p.transpose() * m * p).diagonal();
+
+    const auto compare = p * Matrix4x4::fromDiagonal(d) * p.transpose();
+    {
+      double err = 0.0;
+      for (std::size_t r = 0; r < 4; ++r) {
+        for (std::size_t cc = 0; cc < 4; ++cc) {
+          err = std::max(err, std::abs(compare(r, cc) - m(r, cc)));
+        }
+      }
+      bestErr = std::min(bestErr, err);
+    }
+    if (compare.isApprox(m, precision)) {
+      assert((p.transpose() * p).isIdentity(SANITY_CHECK_PRECISION));
+      assert(std::abs(Matrix4x4::fromDiagonal(d).determinant() - 1.0) <
+             SANITY_CHECK_PRECISION);
+      return std::make_pair(p, d);
+    }
+  }
+  llvm::reportFatalInternalError(llvm::formatv(
+      "TwoQubitWeylDecomposition: failed to diagonalize M2 ({0} iterations). "
+      "best error = {1:e}, precision = {2:e}",
+      maxDiagonalizationAttempts, bestErr, precision));
+}
+
+static std::tuple<Matrix2x2, Matrix2x2, double>
+decomposeTwoQubitProductGate(const Matrix4x4& specialUnitary) {
+  Matrix2x2 r =
+      Matrix2x2::fromElements(specialUnitary(0, 0), specialUnitary(0, 1),
+                              specialUnitary(1, 0), specialUnitary(1, 1));
+  auto detR = r.determinant();
+  if (std::abs(detR) < 0.1) {
+    r = Matrix2x2::fromElements(specialUnitary(2, 0), specialUnitary(2, 1),
+                                specialUnitary(3, 0), specialUnitary(3, 1));
+    detR = r.determinant();
+  }
+  if (std::abs(detR) < 0.1) {
+    llvm::reportFatalInternalError(
+        "decomposeTwoQubitProductGate: unable to decompose: det_r < 0.1");
+  }
+  r *= (1.0 / std::sqrt(detR));
+  const Matrix2x2 rTConj = r.adjoint();
+
+  Matrix4x4 temp = specialUnitary * kron(Matrix2x2::identity(), rTConj);
+
+  Matrix2x2 l =
+      Matrix2x2::fromElements(temp(0, 0), temp(0, 2), temp(2, 0), temp(2, 2));
+  auto detL = l.determinant();
+  if (std::abs(detL) < 0.9) {
+    llvm::reportFatalInternalError(
+        "decomposeTwoQubitProductGate: unable to decompose: detL < 0.9");
+  }
+  l *= (1.0 / std::sqrt(detL));
+  auto phase = std::arg(detL) / 2.;
+
+  return {l, r, phase};
+}
+
+static std::complex<double> getTrace(double a, double b, double c, double ap,
+                                     double bp, double cp) {
+  auto da = a - ap;
+  auto db = b - bp;
+  auto dc = c - cp;
+  return 4. * std::complex<double>{std::cos(da) * std::cos(db) * std::cos(dc),
+                                   std::sin(da) * std::sin(db) * std::sin(dc)};
+}
+
+static Specialization
+bestSpecialization(const TwoQubitWeylDecomposition& decomposition,
+                   const std::optional<double>& requestedFidelity) {
+  auto isClose = [&](double ap, double bp, double cp) -> bool {
+    auto tr = getTrace(decomposition.a(), decomposition.b(), decomposition.c(),
+                       ap, bp, cp);
+    if (requestedFidelity) {
+      return traceToFidelity(tr) >= *requestedFidelity;
+    }
+    return false;
+  };
+
+  auto closestAbc = closestPartialSwap(decomposition.a(), decomposition.b(),
+                                       decomposition.c());
+  auto closestAbMinusC = closestPartialSwap(
+      decomposition.a(), decomposition.b(), -decomposition.c());
+
+  if (isClose(0., 0., 0.)) {
+    return Specialization::IdEquiv;
+  }
+  if (isClose((std::numbers::pi / 4.0), (std::numbers::pi / 4.0),
+              (std::numbers::pi / 4.0)) ||
+      isClose((std::numbers::pi / 4.0), (std::numbers::pi / 4.0),
+              -(std::numbers::pi / 4.0))) {
+    return Specialization::SWAPEquiv;
+  }
+  if (isClose(closestAbc, closestAbc, closestAbc)) {
+    return Specialization::PartialSWAPEquiv;
+  }
+  if (isClose(closestAbMinusC, closestAbMinusC, -closestAbMinusC)) {
+    return Specialization::PartialSWAPFlipEquiv;
+  }
+  if (isClose(decomposition.a(), 0., 0.)) {
+    return Specialization::ControlledEquiv;
+  }
+  if (isClose((std::numbers::pi / 4.0), (std::numbers::pi / 4.0),
+              decomposition.c())) {
+    return Specialization::MirrorControlledEquiv;
+  }
+  if (isClose((decomposition.a() + decomposition.b()) / 2.,
+              (decomposition.a() + decomposition.b()) / 2.,
+              decomposition.c())) {
+    return Specialization::FSimaabEquiv;
+  }
+  if (isClose(decomposition.a(), (decomposition.b() + decomposition.c()) / 2.,
+              (decomposition.b() + decomposition.c()) / 2.)) {
+    return Specialization::FSimabbEquiv;
+  }
+  if (isClose(decomposition.a(), (decomposition.b() - decomposition.c()) / 2.,
+              (decomposition.c() - decomposition.b()) / 2.)) {
+    return Specialization::FSimabmbEquiv;
+  }
+  return Specialization::General;
+}
+
+static bool relativeEq(double lhs, double rhs, double epsilon,
+                       double maxRelative) {
+  if (lhs == rhs) {
+    return true;
+  }
+  if (std::isinf(lhs) || std::isinf(rhs)) {
+    return false;
+  }
+  auto absDiff = std::abs(lhs - rhs);
+  if (absDiff <= epsilon) {
+    return true;
+  }
+  auto absLhs = std::abs(lhs);
+  auto absRhs = std::abs(rhs);
+  if (absRhs > absLhs) {
+    return absDiff <= absRhs * maxRelative;
+  }
+  return absDiff <= absLhs * maxRelative;
+}
+
+TwoQubitWeylDecomposition
+TwoQubitWeylDecomposition::create(const Matrix4x4& unitaryMatrix,
+                                  std::optional<double> fidelity) {
+  auto u = unitaryMatrix;
+  auto detU = u.determinant();
+  // Project into SU(4) by dividing out the fourth root of det(U): for a 4x4
+  // unitary, |det(U)| == 1 so `det^{-1/4}` both enforces det == 1 and removes
+  // the global phase. The extracted phase is tracked separately in
+  // `globalPhase` (quarter of arg(det) to match the fourth-root choice) so the
+  // caller can reconstruct the original matrix exactly if needed.
+  auto detPow = std::pow(detU, -0.25);
+  u *= detPow; // remove global phase from unitary matrix
+  auto globalPhase = std::arg(detU) / 4.;
+
+  // Numerical drift can still leave tiny determinant errors after root
+  // normalization. Re-normalize once more instead of aborting.
+  auto detNormalized = u.determinant();
+  if (std::abs(detNormalized - Complex{1.0, 0.0}) > SANITY_CHECK_PRECISION &&
+      std::abs(detNormalized) > SANITY_CHECK_PRECISION) {
+    u *= std::pow(detNormalized, -0.25);
+  }
+
+  // transform unitary matrix to magic basis; this enables two properties:
+  // 1. if uP ∈ SO(4), V = A ⊗ B (SO(4) → SU(2) ⊗ SU(2))
+  // 2. magic basis diagonalizes canonical gate, allowing calculation of
+  //    canonical gate parameters later on
+  auto uP = magicBasisTransform(u, MagicBasisTransform::OutOf);
+  const Matrix4x4 m2 = uP.transpose() * uP;
+
+  // diagonalization yields eigenvectors (p) and eigenvalues (d);
+  // p is used to calculate K1/K2 (and thus the single-qubit gates
+  // surrounding the canonical gate); d is used to determine the Weyl
+  // coordinates and thus the parameters of the canonical gate
+  auto [p, d] = diagonalizeComplexSymmetric(m2, DIAGONALIZATION_PRECISION);
+
+  // extract Weyl coordinates from eigenvalues, map to [0, 2*pi)
+  constexpr double pi = std::numbers::pi;
+  std::array<double, 4> dReal{};
+  for (std::size_t i = 0; i < d.size(); ++i) {
+    dReal[i] = -std::arg(d[i]) / 2.0;
+  }
+  dReal[3] = -dReal[0] - dReal[1] - dReal[2];
+  std::array<double, 3> cs{};
+  for (std::size_t i = 0; i < cs.size(); ++i) {
+    cs[i] = remEuclid((dReal[i] + dReal[3]) / 2.0, 2.0 * pi);
+  }
+
+  // Reorder coordinates according to min(a, pi/2 - a) with
+  // a = x mod pi/2 for each Weyl coordinate x
+  std::array<double, 3> cstemp{};
+  for (std::size_t i = 0; i < cs.size(); ++i) {
+    const auto tmp = remEuclid(cs[i], pi / 2.0);
+    cstemp[i] = std::min(tmp, (pi / 2.0) - tmp);
+  }
+  std::array<std::size_t, 3> order{0, 1, 2};
+  std::ranges::stable_sort(
+      order, [&](auto a, auto b) { return cstemp[a] < cstemp[b]; });
+  order = {order[1], order[2], order[0]};
+  cs = {cs[order[0]], cs[order[1]], cs[order[2]]};
+  {
+    const std::array<double, 3> reordered{dReal[order[0]], dReal[order[1]],
+                                          dReal[order[2]]};
+    dReal[0] = reordered[0];
+    dReal[1] = reordered[1];
+    dReal[2] = reordered[2];
+  }
+
+  // update eigenvectors (columns of p) according to new order of
+  // weyl coordinates
+  const Matrix4x4 pOrig = p;
+  for (std::size_t i = 0; i < order.size(); ++i) {
+    p.setColumn(i, pOrig.column(order[i]));
+  }
+  // apply correction for determinant if necessary
+  if (p.determinant().real() < 0.0) {
+    auto lastColumn = p.column(3);
+    for (auto& entry : lastColumn) {
+      entry = -entry;
+    }
+    p.setColumn(3, lastColumn);
+  }
+  assert(std::abs(p.determinant() - 1.0) < SANITY_CHECK_PRECISION);
+
+  // re-create complex eigenvalue matrix; this matrix contains the
+  // parameters of the canonical gate which is later used in the
+  // verification. Since the matrix is diagonal, the matrix exponential is
+  // equivalent to the element-wise exponential function.
+  std::array<Complex, 4> tempDiag{};
+  for (std::size_t k = 0; k < tempDiag.size(); ++k) {
+    tempDiag[k] = std::exp(1i * dReal[k]);
+  }
+  const Matrix4x4 temp = Matrix4x4::fromDiagonal(tempDiag);
+
+  // combined matrix k1 of 1q gates after canonical gate
+  Matrix4x4 k1 = uP * p * temp;
+  // k1 must be orthogonal; the tolerance matches the iterative diagonalization
+  // residual rather than the (much tighter) default matrix tolerance.
+  assert((k1.transpose() * k1).isIdentity(SANITY_CHECK_PRECISION));
+  assert(k1.determinant().real() > 0.0);
+  k1 = magicBasisTransform(k1, MagicBasisTransform::Into);
+
+  // combined matrix k2 of 1q gates before canonical gate
+  Matrix4x4 k2 = p.adjoint();
+  // k2 must be orthogonal; see the tolerance note on the k1 check above.
+  assert((k2.transpose() * k2).isIdentity(SANITY_CHECK_PRECISION));
+  assert(k2.determinant().real() > 0.0);
+  k2 = magicBasisTransform(k2, MagicBasisTransform::Into);
+
+  // ensure k1 and k2 are correct (when combined with the canonical gate
+  // parameters in-between, they are equivalent to u)
+  std::array<Complex, 4> tempConjDiag{};
+  for (std::size_t k = 0; k < tempConjDiag.size(); ++k) {
+    tempConjDiag[k] = std::conj(tempDiag[k]);
+  }
+  assert((k1 *
+          magicBasisTransform(Matrix4x4::fromDiagonal(tempConjDiag),
+                              MagicBasisTransform::Into) *
+          k2)
+             .isApprox(u, SANITY_CHECK_PRECISION));
+
+  // calculate k1 = K1l ⊗ K1r
+  auto [K1l, K1r, phaseL] = decomposeTwoQubitProductGate(k1);
+  // decompose k2 = K2l ⊗ K2r
+  auto [K2l, K2r, phaseR] = decomposeTwoQubitProductGate(k2);
+  assert(kron(K1l, K1r).isApprox(k1, SANITY_CHECK_PRECISION));
+  assert(kron(K2l, K2r).isApprox(k2, SANITY_CHECK_PRECISION));
+  // accumulate global phase
+  globalPhase += phaseL + phaseR;
+
+  // Flip into Weyl chamber
+  if (cs[0] > (pi / 2.0)) {
+    cs[0] -= 3.0 * (pi / 2.0);
+    K1l = K1l * iPauliY();
+    K1r = K1r * iPauliY();
+    globalPhase += (pi / 2.0);
+  }
+  if (cs[1] > (pi / 2.0)) {
+    cs[1] -= 3.0 * (pi / 2.0);
+    K1l = K1l * iPauliX();
+    K1r = K1r * iPauliX();
+    globalPhase += (pi / 2.0);
+  }
+  auto conjs = 0;
+  if (cs[0] > (pi / 4.0)) {
+    cs[0] = (pi / 2.0) - cs[0];
+    K1l = K1l * iPauliY();
+    K2r = iPauliY() * K2r;
+    conjs += 1;
+    globalPhase -= (pi / 2.0);
+  }
+  if (cs[1] > (pi / 4.0)) {
+    cs[1] = (pi / 2.0) - cs[1];
+    K1l = K1l * iPauliX();
+    K2r = iPauliX() * K2r;
+    conjs += 1;
+    globalPhase += (pi / 2.0);
+    if (conjs == 1) {
+      globalPhase -= pi;
+    }
+  }
+  if (cs[2] > (pi / 2.0)) {
+    cs[2] -= 3.0 * (pi / 2.0);
+    K1l = K1l * iPauliZ();
+    K1r = K1r * iPauliZ();
+    globalPhase += (pi / 2.0);
+    if (conjs == 1) {
+      globalPhase -= pi;
+    }
+  }
+  if (conjs == 1) {
+    cs[2] = (pi / 2.0) - cs[2];
+    K1l = K1l * iPauliZ();
+    K2r = iPauliZ() * K2r;
+    globalPhase += (pi / 2.0);
+  }
+  if (cs[2] > (pi / 4.0)) {
+    cs[2] -= (pi / 2.0);
+    K1l = K1l * iPauliZ();
+    K1r = K1r * iPauliZ();
+    globalPhase -= (pi / 2.0);
+  }
+
+  // bind weyl coordinates as parameters of canonical gate
+  auto [a, b, c] = std::tie(cs[1], cs[0], cs[2]);
+
+  TwoQubitWeylDecomposition decomposition;
+  decomposition.a_ = a;
+  decomposition.b_ = b;
+  decomposition.c_ = c;
+  decomposition.globalPhase_ = globalPhase;
+  decomposition.k1l_ = K1l;
+  decomposition.k2l_ = K2l;
+  decomposition.k1r_ = K1r;
+  decomposition.k2r_ = K2r;
+  decomposition.specializationKind_ =
+      static_cast<std::uint8_t>(Specialization::General);
+  decomposition.requestedFidelity = fidelity;
+
+  // make sure decomposition is equal to input
+  assert((kron(K1l, K1r) * decomposition.getCanonicalMatrix() * kron(K2l, K2r) *
+          globalPhaseFactor(globalPhase))
+             .isApprox(unitaryMatrix, SANITY_CHECK_PRECISION));
+
+  // determine actual specialization of canonical gate so that the 1q
+  // matrices can potentially be simplified
+  auto flippedFromOriginal = decomposition.applySpecialization();
+
+  auto getTraceValue = [&]() {
+    if (flippedFromOriginal) {
+      return getTrace((pi / 2.0) - a, b, -c, decomposition.a_, decomposition.b_,
+                      decomposition.c_);
+    }
+    return getTrace(a, b, c, decomposition.a_, decomposition.b_,
+                    decomposition.c_);
+  };
+  // use trace to calculate fidelity of applied specialization and
+  // adjust global phase
+  auto trace = getTraceValue();
+  const double calculatedFidelity = traceToFidelity(trace);
+  // final check if specialization is close enough to the original matrix to
+  // satisfy the requested fidelity; since no forced specialization is
+  // allowed, this should never fail
+  if (decomposition.requestedFidelity &&
+      calculatedFidelity + 1.0e-13 < *decomposition.requestedFidelity) {
+    llvm::reportFatalInternalError(llvm::formatv(
+        "TwoQubitWeylDecomposition: Calculated fidelity of "
+        "specialization is worse than requested fidelity ({0:F4} vs {1:F4})!",
+        calculatedFidelity, *decomposition.requestedFidelity));
+  }
+  decomposition.globalPhase_ += std::arg(trace);
+
+  // final check if decomposition is still valid after specialization
+  assert((kron(decomposition.k1l_, decomposition.k1r_) *
+          decomposition.getCanonicalMatrix() *
+          kron(decomposition.k2l_, decomposition.k2r_) *
+          globalPhaseFactor(decomposition.globalPhase_))
+             .isApprox(unitaryMatrix, SANITY_CHECK_PRECISION));
+
+  return decomposition;
+}
+
+Matrix4x4 TwoQubitWeylDecomposition::getCanonicalMatrix(double a, double b,
+                                                        double c) {
+  // Canonical gate `U_d(a, b, c) = exp(i * (a*XX + b*YY + c*ZZ))`. XX/YY/ZZ
+  // commute pairwise, so any product order is equivalent; the order below is
+  // chosen to match common Qiskit/QuantumFlow references. The negated rotation
+  // angles (`-2 * a`, ...) compensate for the `RXX/RYY/RZZ` convention
+  // `exp(-i * theta/2 * XX)`, so that the factored angles sum back to the
+  // intended `+a`, `+b`, `+c`.
+  const auto xx = rxxMatrix(-2.0 * a);
+  const auto yy = ryyMatrix(-2.0 * b);
+  const auto zz = rzzMatrix(-2.0 * c);
+  return zz * yy * xx;
+}
+
+bool TwoQubitWeylDecomposition::applySpecialization() {
+  if (specializationKind_ !=
+      static_cast<std::uint8_t>(Specialization::General)) {
+    llvm::reportFatalInternalError(
+        "Application of specialization only works on "
+        "general Weyl decompositions!");
+  }
+  bool flippedFromOriginal = false;
+  const auto newSpecialization = bestSpecialization(*this, requestedFidelity);
+  if (newSpecialization == Specialization::General) {
+    // U has no special symmetry.
+    //
+    // This gate binds all 6 possible parameters, so there is no need to
+    // make the single-qubit pre-/post-gates canonical.
+    return flippedFromOriginal;
+  }
+  specializationKind_ = static_cast<std::uint8_t>(newSpecialization);
+
+  if (newSpecialization == Specialization::IdEquiv) {
+    // :math:`U \sim U_d(0,0,0)`
+    // Thus, :math:`\sim Id`
+    //
+    // This gate binds 0 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id` , :math:`K2_r = Id`.
+    a_ = 0.;
+    b_ = 0.;
+    c_ = 0.;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k2l_ = Matrix2x2::identity();
+    k1r_ = k1r_ * k2r_;
+    k2r_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::SWAPEquiv) {
+    // :math:`U \sim U_d(\pi/4, \pi/4, \pi/4) \sim U(\pi/4, \pi/4, -\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}`
+    //
+    // This gate binds 0 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id` , :math:`K2_r = Id`.
+    if (c_ > 0.) {
+      // unmodified global phase
+      k1l_ = k1l_ * k2r_;
+      k1r_ = k1r_ * k2l_;
+      k2l_ = Matrix2x2::identity();
+      k2r_ = Matrix2x2::identity();
+    } else {
+      flippedFromOriginal = true;
+
+      globalPhase_ += (std::numbers::pi / 2.0);
+      k1l_ = k1l_ * iPauliZ() * k2r_;
+      k1r_ = k1r_ * iPauliZ() * k2l_;
+      k2l_ = Matrix2x2::identity();
+      k2r_ = Matrix2x2::identity();
+    }
+    a_ = (std::numbers::pi / 4.0);
+    b_ = (std::numbers::pi / 4.0);
+    c_ = (std::numbers::pi / 4.0);
+  } else if (newSpecialization == Specialization::PartialSWAPEquiv) {
+    // :math:`U \sim U_d(\alpha\pi/4, \alpha\pi/4, \alpha\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}^\alpha`
+    //
+    // This gate binds 3 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id`.
+    auto closest = closestPartialSwap(a_, b_, c_);
+    auto k2lDagger = k2l_.adjoint();
+
+    a_ = closest;
+    b_ = closest;
+    c_ = closest;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k1r_ = k1r_ * k2l_;
+    k2r_ = k2lDagger * k2r_;
+    k2l_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::PartialSWAPFlipEquiv) {
+    // :math:`U \sim U_d(\alpha\pi/4, \alpha\pi/4, -\alpha\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}^\alpha`
+    //
+    // (a non-equivalent root of SWAP from the TwoQubitWeylPartialSWAPEquiv
+    // similar to how :math:`x = (\pm \sqrt(x))^2`)
+    //
+    // This gate binds 3 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id`
+    auto closest = closestPartialSwap(a_, b_, -c_);
+    auto k2lDagger = k2l_.adjoint();
+
+    a_ = closest;
+    b_ = closest;
+    c_ = -closest;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k1r_ = k1r_ * iPauliZ() * k2l_ * iPauliZ();
+    k2r_ = iPauliZ() * k2lDagger * iPauliZ() * k2r_;
+    k2l_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::ControlledEquiv) {
+    // :math:`U \sim U_d(\alpha, 0, 0)`
+    // Thus, :math:`U \sim \text{Ctrl-U}`
+    //
+    // This gate binds 4 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) Rx(\lambda_l)`
+    // :math:`K2_r = Ry(\theta_r) Rx(\lambda_r)`
+    const EulerBasis eulerBasis = EulerBasis::XYX;
+    const auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    const auto [k2rtheta, k2rphi, k2rlambda, k2rphase] =
+        anglesFromUnitary(k2r_, EulerBasis::XYX);
+    // unmodified parameter a
+    b_ = 0.;
+    c_ = 0.;
+    globalPhase_ = globalPhase_ + k2lphase + k2rphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * rxMatrix(k2rphi);
+    k2r_ = ryMatrix(k2rtheta) * rxMatrix(k2rlambda);
+  } else if (newSpecialization == Specialization::MirrorControlledEquiv) {
+    // :math:`U \sim U_d(\pi/4, \pi/4, \alpha)`
+    // Thus, :math:`U \sim \text{SWAP} \cdot \text{Ctrl-U}`
+    //
+    // This gate binds 4 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l)\cdot Rz(\lambda_l)`
+    // :math:`K2_r = Ry(\theta_r)\cdot Rz(\lambda_r)`
+    const auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, EulerBasis::ZYZ);
+    const auto [k2rtheta, k2rphi, k2rlambda, k2rphase] =
+        anglesFromUnitary(k2r_, EulerBasis::ZYZ);
+    a_ = (std::numbers::pi / 4.0);
+    b_ = (std::numbers::pi / 4.0);
+    // unmodified parameter c
+    globalPhase_ = globalPhase_ + k2lphase + k2rphase;
+    k1l_ = k1l_ * rzMatrix(k2rphi);
+    k2l_ = ryMatrix(k2ltheta) * rzMatrix(k2llambda);
+    k1r_ = k1r_ * rzMatrix(k2lphi);
+    k2r_ = ryMatrix(k2rtheta) * rzMatrix(k2rlambda);
+  } else if (newSpecialization == Specialization::FSimaabEquiv) {
+    // :math:`U \sim U_d(\alpha, \alpha, \beta), \alpha \geq |\beta|`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rz(\lambda_l)`.
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, EulerBasis::ZYZ);
+    auto ab = (a_ + b_) / 2.;
+
+    a_ = ab;
+    b_ = ab;
+    // unmodified parameter c
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rzMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rzMatrix(k2llambda);
+    k1r_ = k1r_ * rzMatrix(k2lphi);
+    k2r_ = rzMatrix(-k2lphi) * k2r_;
+  } else if (newSpecialization == Specialization::FSimabbEquiv) {
+    // :math:`U \sim U_d(\alpha, \beta, \beta), \alpha \geq \beta \geq 0`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rx(\lambda_l)`
+    auto eulerBasis = EulerBasis::XYX;
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    auto bc = (b_ + c_) / 2.;
+
+    // unmodified parameter a
+    b_ = bc;
+    c_ = bc;
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * rxMatrix(k2lphi);
+    k2r_ = rxMatrix(-k2lphi) * k2r_;
+  } else if (newSpecialization == Specialization::FSimabmbEquiv) {
+    // :math:`U \sim U_d(\alpha, \beta, -\beta), \alpha \geq \beta \geq 0`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rx(\lambda_l)`
+    auto eulerBasis = EulerBasis::XYX;
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    auto bc = (b_ - c_) / 2.;
+
+    // unmodified parameter a
+    b_ = bc;
+    c_ = -bc;
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * iPauliZ() * rxMatrix(k2lphi) * iPauliZ();
+    k2r_ = iPauliZ() * rxMatrix(-k2lphi) * iPauliZ() * k2r_;
+  } else {
+    llvm::reportFatalInternalError(
+        "Unknown specialization for Weyl decomposition!");
+  }
+  return flippedFromOriginal;
+}
+
+TwoQubitBasisDecomposer
+TwoQubitBasisDecomposer::create(const Matrix4x4& basisMatrix,
+                                double basisFidelity) {
+  const Matrix2x2 k12RArr = Matrix2x2::fromElements(
+      1i * FRAC1_SQRT2, FRAC1_SQRT2, -FRAC1_SQRT2, -1i * FRAC1_SQRT2);
+  const Matrix2x2 k12LArr =
+      Matrix2x2::fromElements(Complex{0.5, 0.5}, Complex{0.5, 0.5},
+                              Complex{-0.5, 0.5}, Complex{0.5, -0.5});
+
+  const auto basisDecomposer =
+      TwoQubitWeylDecomposition::create(basisMatrix, basisFidelity);
+  const auto isSuperControlled =
+      relativeEq(basisDecomposer.a(), std::numbers::pi / 4.0, 1e-13, 1e-09) &&
+      relativeEq(basisDecomposer.c(), 0.0, 1e-13, 1e-09);
+
+  // Create some useful matrices U1, U2, U3 are equivalent to the basis,
+  // expand as Ui = Ki1.Ubasis.Ki2
+  auto b = basisDecomposer.b();
+  Complex temp{0.5, -0.5};
+  const Matrix2x2 k11l = Matrix2x2::fromElements(
+      temp * (-1i * std::exp(-1i * b)), temp * std::exp(-1i * b),
+      temp * (-1i * std::exp(1i * b)), temp * -std::exp(1i * b));
+  const Matrix2x2 k11r = Matrix2x2::fromElements(
+      FRAC1_SQRT2 * (1i * std::exp(-1i * b)), FRAC1_SQRT2 * -std::exp(-1i * b),
+      FRAC1_SQRT2 * std::exp(1i * b), FRAC1_SQRT2 * (-1i * std::exp(1i * b)));
+  const Matrix2x2 k32lK21l =
+      Matrix2x2::fromElements(FRAC1_SQRT2 * Complex{1., std::cos(2. * b)},
+                              FRAC1_SQRT2 * (1i * std::sin(2. * b)),
+                              FRAC1_SQRT2 * (1i * std::sin(2. * b)),
+                              FRAC1_SQRT2 * Complex{1., -std::cos(2. * b)});
+  temp = Complex{0.5, 0.5};
+  const Matrix2x2 k21r = Matrix2x2::fromElements(
+      temp * (-1i * std::exp(-2i * b)), temp * std::exp(-2i * b),
+      temp * (1i * std::exp(2i * b)), temp * std::exp(2i * b));
+  const Matrix2x2 k22l = Matrix2x2::fromElements(FRAC1_SQRT2, -FRAC1_SQRT2,
+                                                 FRAC1_SQRT2, FRAC1_SQRT2);
+  const Matrix2x2 k22r = Matrix2x2::fromElements(0, 1, -1, 0);
+  const Matrix2x2 k31l = Matrix2x2::fromElements(
+      FRAC1_SQRT2 * std::exp(-1i * b), FRAC1_SQRT2 * std::exp(-1i * b),
+      FRAC1_SQRT2 * -std::exp(1i * b), FRAC1_SQRT2 * std::exp(1i * b));
+  const Matrix2x2 k31r = Matrix2x2::fromElements(1i * std::exp(1i * b), 0, 0,
+                                                 -1i * std::exp(-1i * b));
+  const Matrix2x2 k32r = Matrix2x2::fromElements(
+      temp * std::exp(1i * b), temp * -std::exp(-1i * b),
+      temp * (-1i * std::exp(1i * b)), temp * (-1i * std::exp(-1i * b)));
+  auto k1lDagger = basisDecomposer.k1l().adjoint();
+  auto k1rDagger = basisDecomposer.k1r().adjoint();
+  auto k2lDagger = basisDecomposer.k2l().adjoint();
+  auto k2rDagger = basisDecomposer.k2r().adjoint();
+  // Pre-build the fixed parts of the matrices used in 3-part decomposition
+  auto u0l = k31l * k1lDagger;
+  auto u0r = k31r * k1rDagger;
+  auto u1l = k2lDagger * k32lK21l * k1lDagger;
+  auto u1ra = k2rDagger * k32r;
+  auto u1rb = k21r * k1rDagger;
+  auto u2la = k2lDagger * k22l;
+  auto u2lb = k11l * k1lDagger;
+  auto u2ra = k2rDagger * k22r;
+  auto u2rb = k11r * k1rDagger;
+  auto u3l = k2lDagger * k12LArr;
+  auto u3r = k2rDagger * k12RArr;
+  // Pre-build the fixed parts of the matrices used in the 2-part decomposition
+  auto q0l = k12LArr.adjoint() * k1lDagger;
+  auto q0r = k12RArr.adjoint() * iPauliZ() * k1rDagger;
+  auto q1la = k2lDagger * k11l.adjoint();
+  auto q1lb = k11l * k1lDagger;
+  auto q1ra = k2rDagger * iPauliZ() * k11r.adjoint();
+  auto q1rb = k11r * k1rDagger;
+  auto q2l = k2lDagger * k12LArr;
+  auto q2r = k2rDagger * k12RArr;
+
+  TwoQubitBasisDecomposer decomposer;
+  decomposer.basisFidelity = basisFidelity;
+  decomposer.basisDecomposer = basisDecomposer;
+  decomposer.isSuperControlled = isSuperControlled;
+  decomposer.smb.u0l = u0l;
+  decomposer.smb.u0r = u0r;
+  decomposer.smb.u1l = u1l;
+  decomposer.smb.u1ra = u1ra;
+  decomposer.smb.u1rb = u1rb;
+  decomposer.smb.u2la = u2la;
+  decomposer.smb.u2lb = u2lb;
+  decomposer.smb.u2ra = u2ra;
+  decomposer.smb.u2rb = u2rb;
+  decomposer.smb.u3l = u3l;
+  decomposer.smb.u3r = u3r;
+  decomposer.smb.q0l = q0l;
+  decomposer.smb.q0r = q0r;
+  decomposer.smb.q1la = q1la;
+  decomposer.smb.q1lb = q1lb;
+  decomposer.smb.q1ra = q1ra;
+  decomposer.smb.q1rb = q1rb;
+  decomposer.smb.q2l = q2l;
+  decomposer.smb.q2r = q2r;
+  return decomposer;
+}
+
+std::optional<TwoQubitNativeDecomposition>
+TwoQubitBasisDecomposer::twoQubitDecompose(
+    const TwoQubitWeylDecomposition& targetDecomposition,
+    std::optional<std::uint8_t> numBasisGateUses) const {
+  auto traces = this->traces(targetDecomposition);
+  auto getDefaultNbasis = [&]() -> std::uint8_t {
+    // Pick the number of basis gate uses `i ∈ {0, 1, 2, 3}` that maximizes
+    //   expected_fidelity(i) = traceToFidelity(traces[i]) * basisFidelity^i.
+    auto bestValue = std::numeric_limits<double>::lowest();
+    auto bestIndex = -1;
+    for (int i = 0; std::cmp_less(i, traces.size()); ++i) {
+      auto value = traceToFidelity(traces[i]) * std::pow(basisFidelity, i);
+      if (std::isnan(value)) {
+        continue;
+      }
+      if (value > bestValue) {
+        bestIndex = i;
+        bestValue = value;
+      }
+    }
+    if (bestIndex < 0) {
+      llvm::reportFatalInternalError("Unable to select basis-gate count: all "
+                                     "candidate fidelities are NaN");
+    }
+    return static_cast<std::uint8_t>(bestIndex);
+  };
+  // number of basis gates that need to be used in the decomposition
+  auto bestNbasis = numBasisGateUses.value_or(getDefaultNbasis());
+  if (bestNbasis > 1 && !isSuperControlled) {
+    // cannot reliably decompose with more than one basis gate and a
+    // non-super-controlled basis gate
+    return std::nullopt;
+  }
+  auto chooseDecomposition = [&]() {
+    if (bestNbasis == 0) {
+      return decomp0(targetDecomposition);
+    }
+    if (bestNbasis == 1) {
+      return decomp1(targetDecomposition);
+    }
+    if (bestNbasis == 2) {
+      return decomp2Supercontrolled(targetDecomposition);
+    }
+    if (bestNbasis == 3) {
+      return decomp3Supercontrolled(targetDecomposition);
+    }
+    llvm::reportFatalInternalError(
+        "Invalid number of basis gates to use in basis decomposition (" +
+        llvm::Twine(bestNbasis) + ")!");
+    llvm_unreachable("");
+  };
+  TwoQubitLocalUnitaryList factors = chooseDecomposition();
+#ifndef NDEBUG
+  for (const auto& factor : factors) {
+    assert(isUnitaryMatrix(factor, 1e-12));
+  }
+#endif
+
+  double globalPhase = targetDecomposition.globalPhase();
+  globalPhase -= bestNbasis * basisDecomposer.globalPhase();
+  if (bestNbasis == 2) {
+    // The two-basis (2x CX/CZ) template in `decomp2Supercontrolled` produces
+    // a sequence whose global phase is off by `pi` relative to the target;
+    // compensate here so the emitted sequence reproduces the target unitary
+    // exactly, not just up to sign.
+    globalPhase += std::numbers::pi;
+  }
+  // large global phases can be generated by the decomposition, thus limit
+  // it to [0, +2*pi)
+  globalPhase = remEuclid(globalPhase, 2.0 * std::numbers::pi);
+
+  return TwoQubitNativeDecomposition{
+      .numBasisUses = bestNbasis,
+      .singleQubitFactors = std::move(factors),
+      .globalPhase = globalPhase,
+  };
+}
+
+TwoQubitLocalUnitaryList
+TwoQubitBasisDecomposer::decomp0(const TwoQubitWeylDecomposition& target) {
+  return TwoQubitLocalUnitaryList{
+      target.k1r() * target.k2r(),
+      target.k1l() * target.k2l(),
+  };
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp1(
+    const TwoQubitWeylDecomposition& target) const {
+  // may not work for z != 0 and c != 0 (not always in Weyl chamber)
+  return TwoQubitLocalUnitaryList{
+      basisDecomposer.k2r().adjoint() * target.k2r(),
+      basisDecomposer.k2l().adjoint() * target.k2l(),
+      target.k1r() * basisDecomposer.k1r().adjoint(),
+      target.k1l() * basisDecomposer.k1l().adjoint(),
+  };
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp2Supercontrolled(
+    const TwoQubitWeylDecomposition& target) const {
+  if (!isSuperControlled) {
+    llvm::reportFatalInternalError(
+        "Basis gate of TwoQubitBasisDecomposer is not super-controlled "
+        "- no guarantee for exact decomposition with two basis gates");
+  }
+  return TwoQubitLocalUnitaryList{
+      smb.q2r * target.k2r(),
+      smb.q2l * target.k2l(),
+      smb.q1ra * rzMatrix(2. * target.b()) * smb.q1rb,
+      smb.q1la * rzMatrix(-2. * target.a()) * smb.q1lb,
+      target.k1r() * smb.q0r,
+      target.k1l() * smb.q0l,
+  };
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp3Supercontrolled(
+    const TwoQubitWeylDecomposition& target) const {
+  if (!isSuperControlled) {
+    llvm::reportFatalInternalError(
+        "Basis gate of TwoQubitBasisDecomposer is not super-controlled "
+        "- no guarantee for exact decomposition with three basis gates");
+  }
+  return TwoQubitLocalUnitaryList{
+      smb.u3r * target.k2r(),
+      smb.u3l * target.k2l(),
+      smb.u2ra * rzMatrix(2. * target.b()) * smb.u2rb,
+      smb.u2la * rzMatrix(-2. * target.a()) * smb.u2lb,
+      smb.u1ra * rzMatrix(-2. * target.c()) * smb.u1rb,
+      smb.u1l,
+      target.k1r() * smb.u0r,
+      target.k1l() * smb.u0l,
+  };
+}
+
+std::array<std::complex<double>, 4>
+TwoQubitBasisDecomposer::traces(const TwoQubitWeylDecomposition& target) const {
+  // Returns the Hilbert-Schmidt traces between the target canonical gate and
+  // the best candidate reachable with `0, 1, 2, 3` uses of the basis gate,
+  // respectively. Fed into `traceToFidelity` by `getDefaultNbasis` to pick
+  // the best basis-gate count. The closed-form expressions specialize
+  // `TwoQubitWeylDecomposition::getTrace(a, b, c, ap, bp, cp)` for:
+  //   i == 0: no basis gate       (ap == bp == cp == 0)
+  //   i == 1: one basis use       (ap == pi/4, bp == basis.b, cp == 0)
+  //   i == 2: two basis uses      (ap == 0, bp == 0, cp == -target.c)
+  //   i == 3: three basis uses    (target reachable exactly -> trace == 4)
+  // so the array has length 4 and is indexed by the number of basis uses.
+  return {
+      4. * std::complex<double>{std::cos(target.a()) * std::cos(target.b()) *
+                                    std::cos(target.c()),
+                                std::sin(target.a()) * std::sin(target.b()) *
+                                    std::sin(target.c())},
+      4. *
+          std::complex<double>{std::cos((std::numbers::pi / 4.0) - target.a()) *
+                                   std::cos(basisDecomposer.b() - target.b()) *
+                                   std::cos(target.c()),
+                               std::sin((std::numbers::pi / 4.0) - target.a()) *
+                                   std::sin(basisDecomposer.b() - target.b()) *
+                                   std::sin(target.c())},
+      std::complex<double>{4. * std::cos(target.c()), 0.},
+      std::complex<double>{4., 0.},
+  };
+}
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp
new file mode 100644
index 0000000000..41fe85847c
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LLVM.h>
+#include <mlir/Support/LogicalResult.h>
+#include <mlir/Support/WalkResult.h>
+#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco {
+
+#define GEN_PASS_DEF_FUSETWOQUBITUNITARYRUNS
+#include "mlir/Dialect/QCO/Transforms/Passes.h.inc"
+
+/// Skips unitaries nested under `ctrl`/`inv` bodies (handled on the shell op).
+static bool isExcludedFromTopLevelUnitaryWalk(Operation* op) {
+  if (op->getParentOfType<CtrlOp>()) {
+    return true;
+  }
+  return !llvm::isa<InvOp>(op) && op->getParentOfType<InvOp>();
+}
+
+static void
+collectUnitaryOpsInPreOrder(Operation* root,
+                            llvm::SmallVectorImpl<Operation*>& ops) {
+  root->walk([&](Operation* op) {
+    if (isExcludedFromTopLevelUnitaryWalk(op)) {
+      return;
+    }
+    if (llvm::isa<UnitaryOpInterface>(op)) {
+      ops.push_back(op);
+    }
+  });
+}
+
+static Value emitSingleQubitMatrix(IRRewriter& rewriter, Location loc,
+                                   Value inQubit, const Matrix2x2& matrix,
+                                   const decomposition::EulerBasis basis) {
+  return *decomposition::synthesizeUnitary1QEuler(
+      rewriter, loc, inQubit, matrix, /*runSize=*/0,
+      /*hasNonBasisGate=*/true, basis);
+}
+
+static bool isTwoQubitRunMember(UnitaryOpInterface unitary) {
+  if (!unitary || !unitary.isTwoQubit()) {
+    return false;
+  }
+  if (llvm::isa<BarrierOp, GPhaseOp>(unitary.getOperation())) {
+    return false;
+  }
+  if (isExcludedFromTopLevelUnitaryWalk(unitary.getOperation())) {
+    return false;
+  }
+  Matrix4x4 matrix;
+  return decomposition::assignTwoQubitOpMatrix(unitary.getOperation(), matrix);
+}
+
+static bool isOneQubitWindowMember(UnitaryOpInterface unitary) {
+  if (!unitary || !unitary.isSingleQubit()) {
+    return false;
+  }
+  if (llvm::isa<BarrierOp, GPhaseOp>(unitary.getOperation())) {
+    return false;
+  }
+  if (isExcludedFromTopLevelUnitaryWalk(unitary.getOperation())) {
+    return false;
+  }
+  Matrix2x2 matrix;
+  return unitary.getUnitaryMatrix2x2(matrix);
+}
+
+static UnitaryOpInterface uniqueUnitaryUser(Value wire) {
+  if (!wire.hasOneUse()) {
+    return {};
+  }
+  auto unitary = llvm::dyn_cast<UnitaryOpInterface>(*wire.user_begin());
+  if (!unitary) {
+    return {};
+  }
+  if (unitary.isTwoQubit()) {
+    return isTwoQubitRunMember(unitary) ? unitary : UnitaryOpInterface{};
+  }
+  if (unitary.isSingleQubit()) {
+    return isOneQubitWindowMember(unitary) ? unitary : UnitaryOpInterface{};
+  }
+  return {};
+}
+
+static Operation* twoQubitGateAtEndOfOneQChain(Value wire) {
+  Value cur = wire;
+  while (Operation* def = cur.getDefiningOp()) {
+    auto unitary = llvm::dyn_cast<UnitaryOpInterface>(def);
+    if (!unitary) {
+      return nullptr;
+    }
+    if (unitary.isTwoQubit()) {
+      return isTwoQubitRunMember(unitary) ? def : nullptr;
+    }
+    if (!isOneQubitWindowMember(unitary)) {
+      return nullptr;
+    }
+    cur = unitary.getInputQubit(0);
+  }
+  return nullptr;
+}
+
+static bool feedsFromSameTwoQubitWindow(UnitaryOpInterface op) {
+  const Value in0 = op.getInputQubit(0);
+  const Value in1 = op.getInputQubit(1);
+  if (!in0.hasOneUse() || !in1.hasOneUse()) {
+    return false;
+  }
+  Operation* gate0 = twoQubitGateAtEndOfOneQChain(in0);
+  Operation* gate1 = twoQubitGateAtEndOfOneQChain(in1);
+  return gate0 != nullptr && gate0 == gate1;
+}
+
+static bool isTwoQubitRunStart(UnitaryOpInterface op) {
+  return isTwoQubitRunMember(op) && !feedsFromSameTwoQubitWindow(op);
+}
+
+namespace {
+
+struct FusableTwoQubitRun {
+  llvm::SmallVector<Operation*, 8> ops;
+  Matrix4x4 composed = Matrix4x4::identity();
+  unsigned numTwoQ = 0;
+  bool anyNonNative = false;
+  Value tailA;
+  Value tailB;
+};
+
+struct OneQubitRun {
+  llvm::SmallVector<UnitaryOpInterface, 4> ops;
+};
+
+} // namespace
+
+// Replace when off-menu ops must be lowered, or when resynthesis uses fewer
+// entanglers than the fused window.
+static bool shouldApplyTwoQubitRunReplacement(const FusableTwoQubitRun& run,
+                                              std::uint8_t numBasisUses) {
+  if (run.anyNonNative) {
+    return true;
+  }
+  return numBasisUses < run.numTwoQ;
+}
+
+static void
+absorbTwoQubitIntoRun(FusableTwoQubitRun& run, UnitaryOpInterface op,
+                      const decomposition::NativeProfileSpec& spec) {
+  Matrix4x4 opMatrix;
+  if (!decomposition::assignTwoQubitOpMatrix(op.getOperation(), opMatrix)) {
+    return;
+  }
+  const Value in0 = op.getInputQubit(0);
+  const Value in1 = op.getInputQubit(1);
+  llvm::SmallVector<std::size_t, 2> ids;
+  if (in0 == run.tailA && in1 == run.tailB) {
+    ids = {0, 1};
+    run.tailA = op.getOutputQubit(0);
+    run.tailB = op.getOutputQubit(1);
+  } else if (in0 == run.tailB && in1 == run.tailA) {
+    ids = {1, 0};
+    run.tailA = op.getOutputQubit(1);
+    run.tailB = op.getOutputQubit(0);
+  } else {
+    return;
+  }
+  run.composed = reorderTwoQubitMatrix(opMatrix, ids[0], ids[1]) * run.composed;
+  run.ops.push_back(op.getOperation());
+  ++run.numTwoQ;
+  if (!decomposition::allowsTwoQubitOp(op.getOperation(), spec)) {
+    run.anyNonNative = true;
+  }
+}
+
+static void absorbOneQubitIntoRun(FusableTwoQubitRun& run,
+                                  UnitaryOpInterface op,
+                                  const decomposition::NativeProfileSpec& spec,
+                                  unsigned wireIndex) {
+  Matrix2x2 raw;
+  if (!op.getUnitaryMatrix2x2(raw)) {
+    return;
+  }
+  const auto pad = embedSingleQubitInTwoQubit(raw, wireIndex);
+  run.composed = pad * run.composed;
+  run.ops.push_back(op.getOperation());
+  if (!decomposition::allowsSingleQubitOp(op, spec)) {
+    run.anyNonNative = true;
+  }
+  if (wireIndex == 0) {
+    run.tailA = op.getOutputQubit(0);
+  } else {
+    run.tailB = op.getOutputQubit(0);
+  }
+}
+
+static FusableTwoQubitRun
+scanFusableTwoQubitRun(UnitaryOpInterface head,
+                       const decomposition::NativeProfileSpec& spec) {
+  FusableTwoQubitRun run;
+  run.tailA = head.getOutputQubit(0);
+  run.tailB = head.getOutputQubit(1);
+  run.ops.push_back(head.getOperation());
+  run.numTwoQ = 1;
+  if (!decomposition::assignTwoQubitOpMatrix(head.getOperation(),
+                                             run.composed)) {
+    run.composed = Matrix4x4::identity();
+    run.numTwoQ = 0;
+    run.ops.clear();
+    return run;
+  }
+  if (!decomposition::allowsTwoQubitOp(head.getOperation(), spec)) {
+    run.anyNonNative = true;
+  }
+
+  while (true) {
+    UnitaryOpInterface nextOnA = uniqueUnitaryUser(run.tailA);
+    UnitaryOpInterface nextOnB = uniqueUnitaryUser(run.tailB);
+
+    if (nextOnA && nextOnB &&
+        nextOnA.getOperation() == nextOnB.getOperation() &&
+        nextOnA.isTwoQubit()) {
+      absorbTwoQubitIntoRun(run, nextOnA, spec);
+      continue;
+    }
+
+    if (nextOnA && nextOnB &&
+        nextOnA.getOperation() != nextOnB.getOperation() &&
+        nextOnA.isSingleQubit() && nextOnB.isSingleQubit()) {
+      if (nextOnA->isBeforeInBlock(nextOnB)) {
+        absorbOneQubitIntoRun(run, nextOnA, spec, /*wireIndex=*/0);
+        continue;
+      }
+      absorbOneQubitIntoRun(run, nextOnB, spec, /*wireIndex=*/1);
+      continue;
+    }
+
+    if (nextOnA && nextOnA.isSingleQubit() &&
+        (!nextOnB || nextOnA.getOperation() != nextOnB.getOperation())) {
+      absorbOneQubitIntoRun(run, nextOnA, spec, /*wireIndex=*/0);
+      continue;
+    }
+
+    if (nextOnB && nextOnB.isSingleQubit() &&
+        (!nextOnA || nextOnB.getOperation() != nextOnA.getOperation())) {
+      absorbOneQubitIntoRun(run, nextOnB, spec, /*wireIndex=*/1);
+      continue;
+    }
+
+    break;
+  }
+  return run;
+}
+
+static void eraseFusableTwoQubitRun(PatternRewriter& rewriter,
+                                    const FusableTwoQubitRun& run) {
+  for (Operation* op : llvm::reverse(run.ops)) {
+    rewriter.eraseOp(op);
+  }
+}
+
+static bool maybeFuseRun(IRRewriter& rewriter, OneQubitRun& run,
+                         const decomposition::EulerBasis basis,
+                         const decomposition::NativeProfileSpec& spec) {
+  Matrix2x2 fused = Matrix2x2::identity();
+  for (UnitaryOpInterface u : run.ops) {
+    Matrix2x2 m;
+    if (!u.getUnitaryMatrix2x2(m)) {
+      return false;
+    }
+    fused.premultiplyBy(m);
+  }
+
+  const bool anyNonNative = llvm::any_of(run.ops, [&](UnitaryOpInterface u) {
+    return !decomposition::allowsSingleQubitOp(u, spec);
+  });
+
+  Operation* firstOp = run.ops.front().getOperation();
+  const Value inQubit = run.ops.front().getInputQubit(0);
+  const Value outQubit = run.ops.back().getOutputQubit(0);
+
+  rewriter.setInsertionPoint(firstOp);
+  const auto replacement = decomposition::synthesizeUnitary1QEuler(
+      rewriter, firstOp->getLoc(), inQubit, fused, run.ops.size(), anyNonNative,
+      basis);
+  if (!replacement) {
+    return false;
+  }
+  rewriter.replaceAllUsesWith(outQubit, *replacement);
+  for (auto& op : llvm::reverse(run.ops)) {
+    rewriter.eraseOp(op.getOperation());
+  }
+  return true;
+}
+
+static UnitaryOpInterface fusibleSingleQubitOp(Operation* op) {
+  auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+  if (!unitary || !unitary.isSingleQubit()) {
+    return {};
+  }
+  if (llvm::isa<BarrierOp, GPhaseOp, CtrlOp>(op)) {
+    return {};
+  }
+  if (isExcludedFromTopLevelUnitaryWalk(op)) {
+    return {};
+  }
+  Matrix2x2 matrix;
+  if (!unitary.getUnitaryMatrix2x2(matrix)) {
+    return {};
+  }
+  return unitary;
+}
+
+namespace {
+
+struct FuseTwoQubitWindowPattern
+    : public OpInterfaceRewritePattern<UnitaryOpInterface> {
+  FuseTwoQubitWindowPattern(MLIRContext* ctx,
+                            decomposition::NativeProfileSpec specIn)
+      : OpInterfaceRewritePattern(ctx), spec(std::move(specIn)) {}
+
+  LogicalResult matchAndRewrite(UnitaryOpInterface op,
+                                PatternRewriter& rewriter) const override {
+    if (!isTwoQubitRunStart(op)) {
+      return failure();
+    }
+
+    FusableTwoQubitRun run = scanFusableTwoQubitRun(op, spec);
+    if (run.ops.size() < 2) {
+      return failure();
+    }
+
+    const auto numBasisUses =
+        decomposition::twoQubitEntanglerCount(run.composed, spec);
+    if (!numBasisUses ||
+        !shouldApplyTwoQubitRunReplacement(run, *numBasisUses)) {
+      return failure();
+    }
+
+    Operation* firstOp = run.ops.front();
+    auto firstUnitary = llvm::cast<UnitaryOpInterface>(firstOp);
+    const Value inA = firstUnitary.getInputQubit(0);
+    const Value inB = firstUnitary.getInputQubit(1);
+
+    rewriter.setInsertionPoint(firstOp);
+    Value newA;
+    Value newB;
+    if (failed(decomposition::synthesizeUnitary2QWeyl(
+            rewriter, firstOp->getLoc(), inA, inB, run.composed, spec, newA,
+            newB))) {
+      firstOp->emitError("failed to emit synthesized two-qubit gate sequence");
+      return failure();
+    }
+    rewriter.replaceAllUsesWith(run.tailA, newA);
+    rewriter.replaceAllUsesWith(run.tailB, newB);
+    eraseFusableTwoQubitRun(rewriter, run);
+    return success();
+  }
+
+  decomposition::NativeProfileSpec spec;
+};
+
+} // namespace
+
+static LogicalResult
+fuseTwoQubitUnitaryRuns(Operation* root,
+                        const decomposition::NativeProfileSpec& spec) {
+  RewritePatternSet patterns(root->getContext());
+  patterns.add<FuseTwoQubitWindowPattern>(patterns.getContext(), spec);
+  return applyPatternsGreedily(root, std::move(patterns));
+}
+
+namespace {
+
+struct FuseTwoQubitUnitaryRunsPass
+    : impl::FuseTwoQubitUnitaryRunsBase<FuseTwoQubitUnitaryRunsPass> {
+  FuseTwoQubitUnitaryRunsPass() = default;
+
+  explicit FuseTwoQubitUnitaryRunsPass(FuseTwoQubitUnitaryRunsOptions options)
+      : FuseTwoQubitUnitaryRunsBase(std::move(options)) {}
+
+protected:
+  void runOnOperation() override {
+    if (llvm::StringRef(nativeGates).trim().empty()) {
+      return;
+    }
+    auto specOpt = decomposition::parseNativeSpec(nativeGates);
+    if (!specOpt) {
+      getOperation().emitError()
+          << "unsupported native gate menu (native-gates='" << nativeGates
+          << "')";
+      signalPassFailure();
+      return;
+    }
+    const auto& spec = *specOpt;
+    const decomposition::EulerBasis oneQubitBasis =
+        decomposition::emitterEulerBasis(spec.singleQubitEmitters.front());
+
+    IRRewriter rewriter(&getContext());
+
+    fuseOneQubitRuns(rewriter, spec, oneQubitBasis);
+    if (failed(fuseTwoQubitUnitaryRuns(getOperation(), spec))) {
+      signalPassFailure();
+      return;
+    }
+    constexpr unsigned kMaxSynthesisSweeps = 4;
+    for (unsigned i = 0; i < kMaxSynthesisSweeps; ++i) {
+      if (failed(synthesizeRemainingOps(rewriter, spec, oneQubitBasis))) {
+        signalPassFailure();
+        return;
+      }
+      if (!hasNonNativeSingleQubitOps(spec)) {
+        break;
+      }
+    }
+    if (hasNonNativeSingleQubitOps(spec)) {
+      getOperation().emitError()
+          << "native gate synthesis did not converge within "
+          << kMaxSynthesisSweeps
+          << " sweeps (single-qubit ops remain outside the native menu)";
+      signalPassFailure();
+      return;
+    }
+    fuseOneQubitRuns(rewriter, spec, oneQubitBasis);
+    constexpr unsigned kPostMenuCleanupSweeps = 4;
+    unsigned postMenuSweepsRemaining = kPostMenuCleanupSweeps;
+    while (hasNonNativeMenuOps(spec) && postMenuSweepsRemaining-- > 0) {
+      if (failed(synthesizeRemainingOps(rewriter, spec, oneQubitBasis))) {
+        signalPassFailure();
+        return;
+      }
+      fuseOneQubitRuns(rewriter, spec, oneQubitBasis);
+    }
+    if (hasNonNativeMenuOps(spec)) {
+      getOperation().emitError()
+          << "native gate synthesis: operations remain outside the native menu "
+             "after final cleanup";
+      signalPassFailure();
+      return;
+    }
+  }
+
+  bool hasNonNativeMenuOps(const decomposition::NativeProfileSpec& spec) {
+    const mlir::WalkResult walkResult =
+        getOperation()->walk([&](Operation* op) {
+          if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+            return mlir::WalkResult::advance();
+          }
+          if (isExcludedFromTopLevelUnitaryWalk(op)) {
+            return mlir::WalkResult::advance();
+          }
+          if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+            if (!decomposition::allowsSingleTargetCtrl(ctrl, spec)) {
+              return mlir::WalkResult::interrupt();
+            }
+            return mlir::WalkResult::advance();
+          }
+          auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+          if (!unitary) {
+            return mlir::WalkResult::advance();
+          }
+          if (unitary.isSingleQubit()) {
+            if (!decomposition::allowsSingleQubitOp(unitary, spec)) {
+              return mlir::WalkResult::interrupt();
+            }
+            return mlir::WalkResult::advance();
+          }
+          if (unitary.isTwoQubit()) {
+            if (!decomposition::allowsBareTwoQubitOp(op, spec)) {
+              return mlir::WalkResult::interrupt();
+            }
+            return mlir::WalkResult::advance();
+          }
+          return mlir::WalkResult::interrupt();
+        });
+    return walkResult.wasInterrupted();
+  }
+
+  bool
+  hasNonNativeSingleQubitOps(const decomposition::NativeProfileSpec& spec) {
+    const mlir::WalkResult walkResult =
+        getOperation()->walk([&](Operation* op) {
+          if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+            return mlir::WalkResult::advance();
+          }
+          if (isExcludedFromTopLevelUnitaryWalk(op)) {
+            return mlir::WalkResult::advance();
+          }
+          auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+          if (!unitary || !unitary.isSingleQubit()) {
+            return mlir::WalkResult::advance();
+          }
+          if (!decomposition::allowsSingleQubitOp(unitary, spec)) {
+            return mlir::WalkResult::interrupt();
+          }
+          return mlir::WalkResult::advance();
+        });
+    return walkResult.wasInterrupted();
+  }
+
+private:
+  void fuseOneQubitRuns(IRRewriter& rewriter,
+                        const decomposition::NativeProfileSpec& spec,
+                        const decomposition::EulerBasis basis) {
+    llvm::SmallVector<OneQubitRun> runs;
+    llvm::DenseMap<Operation*, size_t> tailOpToRun;
+
+    // Require single-use tail output so fan-out wires are not fused away.
+    getOperation()->walk([&](Operation* op) {
+      auto unitary = fusibleSingleQubitOp(op);
+      if (!unitary) {
+        return;
+      }
+      Value inQubit = unitary.getInputQubit(0);
+      Operation* defOp = inQubit.getDefiningOp();
+      auto it =
+          (defOp != nullptr) ? tailOpToRun.find(defOp) : tailOpToRun.end();
+      const bool canExtend = it != tailOpToRun.end() && inQubit.hasOneUse();
+      if (canExtend) {
+        const size_t runIdx = it->second;
+        runs[runIdx].ops.push_back(unitary);
+        tailOpToRun.erase(it);
+        tailOpToRun[op] = runIdx;
+      } else {
+        runs.push_back(OneQubitRun{});
+        runs.back().ops.push_back(unitary);
+        tailOpToRun[op] = runs.size() - 1;
+      }
+    });
+
+    for (auto& run : runs) {
+      if (run.ops.size() < 2) {
+        continue;
+      }
+      (void)maybeFuseRun(rewriter, run, basis, spec);
+    }
+  }
+
+  LogicalResult
+  synthesizeRemainingOps(IRRewriter& rewriter,
+                         const decomposition::NativeProfileSpec& spec,
+                         const decomposition::EulerBasis basis) {
+    llvm::SmallVector<Operation*, 32> ops;
+    collectUnitaryOpsInPreOrder(getOperation(), ops);
+    llvm::DenseSet<Operation*> erasedOps;
+
+    for (Operation* op : ops) {
+      if (erasedOps.contains(op)) {
+        continue;
+      }
+      if (isExcludedFromTopLevelUnitaryWalk(op)) {
+        continue;
+      }
+      if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+        continue;
+      }
+      auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+      if (!unitary) {
+        continue;
+      }
+
+      if (unitary.isSingleQubit()) {
+        if (!decomposition::allowsSingleQubitOp(unitary, spec)) {
+          if (failed(rewriteSingleQubit(rewriter, op, unitary, basis))) {
+            return failure();
+          }
+          erasedOps.insert(op);
+        }
+        continue;
+      }
+
+      if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+        const bool wasAlreadyNative =
+            decomposition::allowsSingleTargetCtrl(ctrl, spec);
+        if (failed(rewriteControlled(rewriter, ctrl, spec))) {
+          return failure();
+        }
+        if (!wasAlreadyNative) {
+          erasedOps.insert(op);
+        }
+        continue;
+      }
+
+      if (unitary.isTwoQubit()) {
+        if (failed(rewriteTwoQubit(rewriter, op, unitary, spec))) {
+          return failure();
+        }
+        erasedOps.insert(op);
+        continue;
+      }
+    }
+    return success();
+  }
+
+  static LogicalResult
+  rewriteSingleQubit(IRRewriter& rewriter, Operation* op,
+                     UnitaryOpInterface unitary,
+                     const decomposition::EulerBasis basis) {
+    rewriter.setInsertionPoint(op);
+    const Value in = unitary.getInputQubit(0);
+    Matrix2x2 matrix;
+    if (!unitary.getUnitaryMatrix2x2(matrix)) {
+      op->emitError("single-qubit operation with non-constant parameters is "
+                    "not supported for native synthesis");
+      return failure();
+    }
+    const Value replaced =
+        emitSingleQubitMatrix(rewriter, op->getLoc(), in, matrix, basis);
+    rewriter.replaceOp(op, replaced);
+    return success();
+  }
+
+  static LogicalResult
+  rewriteControlled(IRRewriter& rewriter, CtrlOp ctrl,
+                    const decomposition::NativeProfileSpec& spec) {
+    if (decomposition::allowsSingleTargetCtrl(ctrl, spec)) {
+      return success();
+    }
+    Matrix4x4 matrix;
+    if (!decomposition::assignTwoQubitOpMatrix(ctrl.getOperation(), matrix)) {
+      ctrl.emitError(
+          "native synthesis: cannot build a constant 4x4 matrix for this "
+          "controlled gate (unsupported body or non-constant parameters)");
+      return failure();
+    }
+    rewriter.setInsertionPoint(ctrl);
+    Value out0;
+    Value out1;
+    if (failed(decomposition::synthesizeUnitary2QWeyl(
+            rewriter, ctrl.getLoc(), ctrl.getInputControl(0),
+            ctrl.getInputTarget(0), matrix, spec, out0, out1))) {
+      ctrl.emitError("controlled gate not allowed by selected profile");
+      return failure();
+    }
+    rewriter.replaceOp(ctrl, ValueRange{out0, out1});
+    return success();
+  }
+
+  static LogicalResult
+  rewriteTwoQubit(IRRewriter& rewriter, Operation* op,
+                  UnitaryOpInterface unitary,
+                  const decomposition::NativeProfileSpec& spec) {
+    if (decomposition::allowsBareTwoQubitOp(op, spec)) {
+      return success();
+    }
+    Matrix4x4 matrix;
+    if (!decomposition::assignTwoQubitOpMatrix(op, matrix)) {
+      op->emitError("unsupported two-qubit operation for selected profile");
+      return failure();
+    }
+    rewriter.setInsertionPoint(op);
+    Value out0;
+    Value out1;
+    if (failed(decomposition::synthesizeUnitary2QWeyl(
+            rewriter, op->getLoc(), unitary.getInputQubit(0),
+            unitary.getInputQubit(1), matrix, spec, out0, out1))) {
+      op->emitError("unsupported two-qubit operation for selected profile");
+      return failure();
+    }
+    rewriter.replaceOp(op, ValueRange{out0, out1});
+    return success();
+  }
+};
+
+} // namespace
+
+} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Utils/Matrix.cpp b/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
index 8df45840a3..3588bbdae7 100644
--- a/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
+++ b/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/QCO/Utils/Matrix.h"
 
 #include <llvm/ADT/ArrayRef.h>
+#include <llvm/Support/ErrorHandling.h>
 #include <mlir/Support/LLVM.h>
 
 #include <algorithm>
@@ -231,6 +232,10 @@ Matrix2x2 Matrix2x2::adjoint() const {
                       std::conj(data[1]), std::conj(data[3]));
 }
 
+Matrix2x2 Matrix2x2::transpose() const {
+  return fromElements(data[0], data[2], data[1], data[3]);
+}
+
 Complex Matrix2x2::trace() const { return data[0] + data[3]; }
 
 Complex Matrix2x2::determinant() const {
@@ -241,6 +246,10 @@ bool Matrix2x2::isApprox(const Matrix2x2& other, const double tol) const {
   return entriesAreApprox(data, other.data, tol);
 }
 
+bool Matrix2x2::isIdentity(const double tol) const {
+  return isApprox(fromElements(1.0, 0.0, 0.0, 1.0), tol);
+}
+
 bool Matrix2x2::assignFrom(const DynamicMatrix& src) {
   return assignFromDynamicImpl<K_ROWS, K_SIZE_AT_COMPILE_TIME>(src, data);
 }
@@ -296,6 +305,16 @@ Matrix4x4 Matrix4x4::adjoint() const {
   return out;
 }
 
+Matrix4x4 Matrix4x4::transpose() const {
+  Matrix4x4 out{};
+  for (std::size_t row = 0; row < K_ROWS; ++row) {
+    for (std::size_t col = 0; col < K_COLS; ++col) {
+      out.data[(col * K_COLS) + row] = data[(row * K_COLS) + col];
+    }
+  }
+  return out;
+}
+
 Complex Matrix4x4::trace() const {
   return data[0] + data[5] + data[10] + data[15];
 }
@@ -321,6 +340,58 @@ bool Matrix4x4::isApprox(const Matrix4x4& other, const double tol) const {
   return entriesAreApprox(data, other.data, tol);
 }
 
+bool Matrix4x4::isIdentity(const double tol) const {
+  Matrix4x4 id{};
+  for (std::size_t i = 0; i < K_ROWS; ++i) {
+    id.data[(i * K_COLS) + i] = 1.0;
+  }
+  return isApprox(id, tol);
+}
+
+std::array<Complex, Matrix4x4::K_ROWS> Matrix4x4::diagonal() const {
+  return {data[0], data[5], data[10], data[15]};
+}
+
+Matrix4x4
+Matrix4x4::fromDiagonal(const std::array<Complex, K_ROWS>& diagonalEntries) {
+  Matrix4x4 out{};
+  for (std::size_t i = 0; i < K_ROWS; ++i) {
+    out.data[(i * K_COLS) + i] = diagonalEntries[i];
+  }
+  return out;
+}
+
+std::array<Complex, Matrix4x4::K_ROWS>
+Matrix4x4::column(const std::size_t col) const {
+  return {data[col], data[K_COLS + col], data[(2 * K_COLS) + col],
+          data[(3 * K_COLS) + col]};
+}
+
+void Matrix4x4::setColumn(const std::size_t col,
+                          const std::array<Complex, K_ROWS>& values) {
+  for (std::size_t row = 0; row < K_ROWS; ++row) {
+    data[(row * K_COLS) + col] = values[row];
+  }
+}
+
+std::array<double, Matrix4x4::K_SIZE_AT_COMPILE_TIME>
+Matrix4x4::realPart() const {
+  std::array<double, K_SIZE_AT_COMPILE_TIME> out{};
+  for (std::size_t i = 0; i < K_SIZE_AT_COMPILE_TIME; ++i) {
+    out[i] = data[i].real();
+  }
+  return out;
+}
+
+std::array<double, Matrix4x4::K_SIZE_AT_COMPILE_TIME>
+Matrix4x4::imagPart() const {
+  std::array<double, K_SIZE_AT_COMPILE_TIME> out{};
+  for (std::size_t i = 0; i < K_SIZE_AT_COMPILE_TIME; ++i) {
+    out[i] = data[i].imag();
+  }
+  return out;
+}
+
 bool Matrix4x4::assignFrom(const DynamicMatrix& src) {
   return assignFromDynamicImpl<K_ROWS, K_SIZE_AT_COMPILE_TIME>(src, data);
 }
@@ -453,4 +524,395 @@ bool DynamicMatrix::isApprox(const DynamicMatrix& other,
   return entriesAreApprox(impl_->data, other.impl_->data, tol);
 }
 
+Complex DynamicMatrix::trace() const {
+  Complex sum{0.0, 0.0};
+  const auto udim = checkedDim(impl_->dim);
+  for (std::size_t i = 0; i < udim; ++i) {
+    sum += impl_->data[(i * udim) + i];
+  }
+  return sum;
+}
+
+DynamicMatrix DynamicMatrix::operator*(const DynamicMatrix& rhs) const {
+  if (impl_->dim != rhs.impl_->dim) {
+    llvm::reportFatalInternalError(
+        "DynamicMatrix multiply requires matching dimensions");
+  }
+  const auto udim = checkedDim(impl_->dim);
+  DynamicMatrix out(impl_->dim);
+  for (std::size_t row = 0; row < udim; ++row) {
+    for (std::size_t col = 0; col < udim; ++col) {
+      Complex sum{0.0, 0.0};
+      for (std::size_t k = 0; k < udim; ++k) {
+        sum +=
+            impl_->data[(row * udim) + k] * rhs.impl_->data[(k * udim) + col];
+      }
+      out.impl_->data[(row * udim) + col] = sum;
+    }
+  }
+  return out;
+}
+
+DynamicMatrix DynamicMatrix::operator*(const Complex& scalar) const {
+  DynamicMatrix out(impl_->dim);
+  for (std::size_t i = 0; i < impl_->data.size(); ++i) {
+    out.impl_->data[i] = impl_->data[i] * scalar;
+  }
+  return out;
+}
+
+bool DynamicMatrix::isIdentity(const double tol) const {
+  const auto udim = checkedDim(impl_->dim);
+  for (std::size_t row = 0; row < udim; ++row) {
+    for (std::size_t col = 0; col < udim; ++col) {
+      const Complex expected =
+          (row == col) ? Complex{1.0, 0.0} : Complex{0.0, 0.0};
+      if (std::abs(impl_->data[(row * udim) + col] - expected) > tol) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+[[nodiscard]] static std::size_t qubitBitAt(const std::size_t stateIndex,
+                                            const std::size_t numQubits,
+                                            const std::size_t qubitIndex) {
+  return (stateIndex >> (numQubits - 1 - qubitIndex)) & 1U;
+}
+
+[[nodiscard]] static bool otherQubitBitsMatch(const std::size_t row,
+                                              const std::size_t col,
+                                              const std::size_t numQubits,
+                                              const std::size_t skipA,
+                                              const std::size_t skipB) {
+  for (std::size_t q = 0; q < numQubits; ++q) {
+    if (q == skipA || q == skipB) {
+      continue;
+    }
+    if (qubitBitAt(row, numQubits, q) != qubitBitAt(col, numQubits, q)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+[[nodiscard]] static bool otherSingleQubitBitsMatch(const std::size_t row,
+                                                    const std::size_t col,
+                                                    const std::size_t numQubits,
+                                                    const std::size_t skip) {
+  for (std::size_t q = 0; q < numQubits; ++q) {
+    if (q == skip) {
+      continue;
+    }
+    if (qubitBitAt(row, numQubits, q) != qubitBitAt(col, numQubits, q)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+Matrix2x2 operator*(const Complex& scalar, const Matrix2x2& matrix) {
+  return matrix * scalar;
+}
+
+Matrix4x4 operator*(const Complex& scalar, const Matrix4x4& matrix) {
+  return matrix * scalar;
+}
+
+DynamicMatrix operator*(const Complex& scalar, const DynamicMatrix& matrix) {
+  return matrix * scalar;
+}
+
+Matrix4x4 kron(const Matrix2x2& lhs, const Matrix2x2& rhs) {
+  Matrix4x4 out{};
+  for (std::size_t i = 0; i < Matrix2x2::K_ROWS; ++i) {
+    for (std::size_t j = 0; j < Matrix2x2::K_COLS; ++j) {
+      const Complex a = lhs(i, j);
+      for (std::size_t k = 0; k < Matrix2x2::K_ROWS; ++k) {
+        for (std::size_t l = 0; l < Matrix2x2::K_COLS; ++l) {
+          out((2 * i) + k, (2 * j) + l) = a * rhs(k, l);
+        }
+      }
+    }
+  }
+  return out;
+}
+
+const Matrix4x4& twoQubitSwapMatrix() {
+  static const Matrix4x4 MATRIX = Matrix4x4::fromElements(1, 0, 0, 0, //
+                                                          0, 0, 1, 0, //
+                                                          0, 1, 0, 0, //
+                                                          0, 0, 0, 1);
+  return MATRIX;
+}
+
+Matrix4x4 embedSingleQubitInTwoQubit(const Matrix2x2& matrix,
+                                     const std::size_t qubitIndex) {
+  if (qubitIndex == 0) {
+    return kron(matrix, Matrix2x2::identity());
+  }
+  if (qubitIndex == 1) {
+    return kron(Matrix2x2::identity(), matrix);
+  }
+  llvm::reportFatalInternalError("Invalid qubit index for single-qubit embed");
+}
+
+Matrix4x4 reorderTwoQubitMatrix(const Matrix4x4& matrix,
+                                const std::size_t q0Index,
+                                const std::size_t q1Index) {
+  if (q0Index == 0 && q1Index == 1) {
+    return matrix;
+  }
+  if (q0Index == 1 && q1Index == 0) {
+    const auto& swap = twoQubitSwapMatrix();
+    return swap * matrix * swap;
+  }
+  llvm::reportFatalInternalError("Invalid qubit indices for two-qubit reorder");
+}
+
+DynamicMatrix embedSingleQubitInNqubit(const Matrix2x2& matrix,
+                                       const std::size_t numQubits,
+                                       const std::size_t qubitIndex) {
+  if (qubitIndex >= numQubits) {
+    llvm::reportFatalInternalError(
+        "Invalid qubit index for single-qubit embed");
+  }
+  if (numQubits == 2) {
+    return DynamicMatrix(embedSingleQubitInTwoQubit(matrix, qubitIndex));
+  }
+  const auto dim = static_cast<std::int64_t>(1ULL << numQubits);
+  DynamicMatrix out(dim);
+  const auto udim = static_cast<std::size_t>(dim);
+  for (std::size_t row = 0; row < udim; ++row) {
+    for (std::size_t col = 0; col < udim; ++col) {
+      if (!otherSingleQubitBitsMatch(row, col, numQubits, qubitIndex)) {
+        continue;
+      }
+      const std::size_t rowBit = qubitBitAt(row, numQubits, qubitIndex);
+      const std::size_t colBit = qubitBitAt(col, numQubits, qubitIndex);
+      out(static_cast<std::int64_t>(row), static_cast<std::int64_t>(col)) =
+          matrix(rowBit, colBit);
+    }
+  }
+  return out;
+}
+
+DynamicMatrix embedTwoQubitInNqubit(const Matrix4x4& matrix,
+                                    const std::size_t numQubits,
+                                    const std::size_t q0Index,
+                                    const std::size_t q1Index) {
+  if (q0Index >= numQubits || q1Index >= numQubits || q0Index == q1Index) {
+    llvm::reportFatalInternalError("Invalid qubit indices for two-qubit embed");
+  }
+  if (numQubits == 2) {
+    return DynamicMatrix(reorderTwoQubitMatrix(matrix, q0Index, q1Index));
+  }
+  const auto dim = static_cast<std::int64_t>(1ULL << numQubits);
+  DynamicMatrix out(dim);
+  const auto udim = static_cast<std::size_t>(dim);
+  for (std::size_t row = 0; row < udim; ++row) {
+    for (std::size_t col = 0; col < udim; ++col) {
+      if (!otherQubitBitsMatch(row, col, numQubits, q0Index, q1Index)) {
+        continue;
+      }
+      const std::size_t rowPair = (qubitBitAt(row, numQubits, q0Index) << 1) |
+                                  qubitBitAt(row, numQubits, q1Index);
+      const std::size_t colPair = (qubitBitAt(col, numQubits, q0Index) << 1) |
+                                  qubitBitAt(col, numQubits, q1Index);
+      out(static_cast<std::int64_t>(row), static_cast<std::int64_t>(col)) =
+          matrix(rowPair, colPair);
+    }
+  }
+  return out;
+}
+
+SymmetricEigen4 jacobiSymmetricEigen(const std::array<double, 16>& symmetric) {
+  constexpr std::size_t n = 4;
+  constexpr int maxSweeps = 100;
+
+  std::array<double, 16> a = symmetric;
+  std::array<double, 16> v{};
+  for (std::size_t i = 0; i < n; ++i) {
+    v[(i * n) + i] = 1.0;
+  }
+
+  for (int sweep = 0; sweep < maxSweeps; ++sweep) {
+    double off = 0.0;
+    for (std::size_t p = 0; p < n; ++p) {
+      for (std::size_t q = p + 1; q < n; ++q) {
+        off += a[(p * n) + q] * a[(p * n) + q];
+      }
+    }
+    if (off <= 1e-30) {
+      break;
+    }
+
+    for (std::size_t p = 0; p < n; ++p) {
+      for (std::size_t q = p + 1; q < n; ++q) {
+        const double apq = a[(p * n) + q];
+        if (std::abs(apq) <= 1e-300) {
+          continue;
+        }
+        const double app = a[(p * n) + p];
+        const double aqq = a[(q * n) + q];
+        // Rotation angle that annihilates the (p, q) off-diagonal entry.
+        const double phi = 0.5 * std::atan2(2.0 * apq, aqq - app);
+        const double c = std::cos(phi);
+        const double s = std::sin(phi);
+
+        // Right-multiply by the Givens rotation: columns p and q.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double akp = a[(k * n) + p];
+          const double akq = a[(k * n) + q];
+          a[(k * n) + p] = (c * akp) - (s * akq);
+          a[(k * n) + q] = (s * akp) + (c * akq);
+        }
+        // Left-multiply by the transposed rotation: rows p and q.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double apk = a[(p * n) + k];
+          const double aqk = a[(q * n) + k];
+          a[(p * n) + k] = (c * apk) - (s * aqk);
+          a[(q * n) + k] = (s * apk) + (c * aqk);
+        }
+        // Accumulate the rotation into the eigenvector matrix.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double vkp = v[(k * n) + p];
+          const double vkq = v[(k * n) + q];
+          v[(k * n) + p] = (c * vkp) - (s * vkq);
+          v[(k * n) + q] = (s * vkp) + (c * vkq);
+        }
+      }
+    }
+  }
+
+  std::array<double, 4> evals{a[0], a[5], a[10], a[15]};
+  std::array<std::size_t, 4> order{0, 1, 2, 3};
+  std::ranges::sort(order, [&evals](const std::size_t x, const std::size_t y) {
+    return evals[x] < evals[y];
+  });
+
+  SymmetricEigen4 result;
+  for (std::size_t j = 0; j < n; ++j) {
+    const std::size_t src = order[j];
+    result.eigenvalues[j] = evals[src];
+    for (std::size_t i = 0; i < n; ++i) {
+      result.eigenvectors(i, j) = Complex{v[(i * n) + src], 0.0};
+    }
+  }
+  return result;
+}
+
+double remEuclid(double a, double b) {
+  if (b == 0.0) {
+    llvm::reportFatalInternalError("remEuclid expects non-zero divisor");
+  }
+  const auto r = std::fmod(a, b);
+  return (r < 0.0) ? r + std::abs(b) : r;
+}
+
+double traceToFidelity(const Complex& trace) {
+  const auto traceAbs = std::abs(trace);
+  return (4.0 + (traceAbs * traceAbs)) / 20.0;
+}
+
+Complex globalPhaseFactor(double phase) {
+  return std::exp(Complex{0.0, 1.0} * phase);
+}
+
+bool isUnitaryMatrix(const Matrix2x2& matrix, const double tolerance) {
+  return (matrix.adjoint() * matrix).isIdentity(tolerance);
+}
+
+Matrix2x2 rxMatrix(const double theta) {
+  const auto halfTheta = theta / 2.0;
+  const Complex cos{std::cos(halfTheta), 0.0};
+  const Complex isin{0.0, -std::sin(halfTheta)};
+  return Matrix2x2::fromElements(cos, isin, isin, cos);
+}
+
+Matrix2x2 ryMatrix(const double theta) {
+  const auto halfTheta = theta / 2.0;
+  const Complex cos{std::cos(halfTheta), 0.0};
+  const Complex sin{std::sin(halfTheta), 0.0};
+  return Matrix2x2::fromElements(cos, -sin, sin, cos);
+}
+
+Matrix2x2 rzMatrix(const double theta) {
+  return Matrix2x2::fromElements(
+      Complex{std::cos(theta / 2.0), -std::sin(theta / 2.0)}, 0.0, 0.0,
+      Complex{std::cos(theta / 2.0), std::sin(theta / 2.0)});
+}
+
+const Matrix2x2& iPauliZ() {
+  static const Matrix2x2 MATRIX =
+      Matrix2x2::fromElements(Complex{0.0, 1.0}, 0.0, 0.0, Complex{0.0, -1.0});
+  return MATRIX;
+}
+
+const Matrix2x2& iPauliY() {
+  static const Matrix2x2 MATRIX = Matrix2x2::fromElements(0.0, 1.0, -1.0, 0.0);
+  return MATRIX;
+}
+
+const Matrix2x2& iPauliX() {
+  static const Matrix2x2 MATRIX =
+      Matrix2x2::fromElements(0.0, Complex{0.0, 1.0}, Complex{0.0, 1.0}, 0.0);
+  return MATRIX;
+}
+
+Matrix4x4 rxxMatrix(const double theta) {
+  const auto cosTheta = std::cos(theta / 2.0);
+  const Complex misin{0.0, -std::sin(theta / 2.0)};
+  return Matrix4x4::fromElements(cosTheta, 0.0, 0.0, misin, //
+                                 0.0, cosTheta, misin, 0.0, //
+                                 0.0, misin, cosTheta, 0.0, //
+                                 misin, 0.0, 0.0, cosTheta);
+}
+
+Matrix4x4 ryyMatrix(const double theta) {
+  const auto cosTheta = std::cos(theta / 2.0);
+  const Complex isin{0.0, std::sin(theta / 2.0)};
+  const Complex misin{0.0, -std::sin(theta / 2.0)};
+  return Matrix4x4::fromElements(cosTheta, 0.0, 0.0, isin,  //
+                                 0.0, cosTheta, misin, 0.0, //
+                                 0.0, misin, cosTheta, 0.0, //
+                                 isin, 0.0, 0.0, cosTheta);
+}
+
+Matrix4x4 rzzMatrix(const double theta) {
+  const auto cosTheta = std::cos(theta / 2.0);
+  const auto sinTheta = std::sin(theta / 2.0);
+  const Complex em{cosTheta, -sinTheta};
+  const Complex ep{cosTheta, sinTheta};
+  return Matrix4x4::fromElements(em, 0.0, 0.0, 0.0, //
+                                 0.0, ep, 0.0, 0.0, //
+                                 0.0, 0.0, ep, 0.0, //
+                                 0.0, 0.0, 0.0, em);
+}
+
+const Matrix4x4& twoQubitControlledX01() {
+  static const Matrix4x4 MATRIX = Matrix4x4::fromElements(1.0, 0.0, 0.0, 0.0, //
+                                                          0.0, 1.0, 0.0, 0.0, //
+                                                          0.0, 0.0, 0.0, 1.0, //
+                                                          0.0, 0.0, 1.0, 0.0);
+  return MATRIX;
+}
+
+const Matrix4x4& twoQubitControlledX10() {
+  static const Matrix4x4 MATRIX = Matrix4x4::fromElements(1.0, 0.0, 0.0, 0.0, //
+                                                          0.0, 0.0, 0.0, 1.0, //
+                                                          0.0, 0.0, 1.0, 0.0, //
+                                                          0.0, 1.0, 0.0, 0.0);
+  return MATRIX;
+}
+
+const Matrix4x4& twoQubitControlledZ() {
+  static const Matrix4x4 MATRIX = Matrix4x4::fromElements(1.0, 0.0, 0.0, 0.0, //
+                                                          0.0, 1.0, 0.0, 0.0, //
+                                                          0.0, 0.0, 1.0, 0.0, //
+                                                          0.0, 0.0, 0.0, -1.0);
+  return MATRIX;
+}
+
 } // namespace mlir::qco
diff --git a/mlir/tools/mqt-cc/CMakeLists.txt b/mlir/tools/mqt-cc/CMakeLists.txt
index 022aa279d8..25adecd290 100644
--- a/mlir/tools/mqt-cc/CMakeLists.txt
+++ b/mlir/tools/mqt-cc/CMakeLists.txt
@@ -8,7 +8,18 @@
 
 # Build the compiler driver executable
 add_mlir_tool(mqt-cc mqt-cc.cpp DEPENDS MQTCompilerPipeline SUPPORT_PLUGINS)
-target_link_libraries(mqt-cc PRIVATE MQTCompilerPipeline MLIRParser MLIRSupport MLIRQCTranslation)
+target_link_libraries(
+  mqt-cc
+  PRIVATE MQTCompilerPipeline
+          # Required for parsing .mlir files
+          MLIRParser
+          # Required for file I/O
+          MLIRSupport
+          # Required for OpenQASM parsing
+          MQT::CoreQASM
+          MQT::CoreIR
+          MLIRQCTranslation
+          MLIRMemRefDialect)
 
 mqt_mlir_target_use_project_options(mqt-cc)
 llvm_update_compile_flags(mqt-cc)
diff --git a/mlir/tools/mqt-cc/mqt-cc.cpp b/mlir/tools/mqt-cc/mqt-cc.cpp
index d8fb75daa4..a2f12cc870 100644
--- a/mlir/tools/mqt-cc/mqt-cc.cpp
+++ b/mlir/tools/mqt-cc/mqt-cc.cpp
@@ -88,6 +88,13 @@ static llvm::cl::opt<bool> enableHadamardLifting(
     llvm::cl::desc("Apply Hadamard lifting during optimization"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<std::string> nativeGates(
+    "native-gates",
+    llvm::cl::desc(
+        "Comma-separated native gate menu for the fuse-two-qubit-unitary-runs "
+        "pass"),
+    llvm::cl::value_desc("csv"), llvm::cl::init(""));
+
 /**
  * @brief Load and parse a .qasm file
  */
@@ -179,6 +186,7 @@ int main(int argc, char** argv) {
   config.disableMergeSingleQubitRotationGates =
       disableMergeSingleQubitRotationGates;
   config.enableHadamardLifting = enableHadamardLifting;
+  config.nativeGates = nativeGates.getValue();
 
   // Run the compilation pipeline
   CompilationRecord record;
@@ -199,7 +207,8 @@ int main(int argc, char** argv) {
                  << record.afterQCOConversion << "\n";
     llvm::outs() << "After Initial QCO Canonicalization:\n"
                  << record.afterQCOCanon << "\n";
-    llvm::outs() << "After Optimization:\n" << record.afterOptimization << "\n";
+    llvm::outs() << "After Optimization and Native Gate Synthesis:\n"
+                 << record.afterOptimization << "\n";
     llvm::outs() << "After Final QCO Canonicalization:\n"
                  << record.afterOptimizationCanon << "\n";
     llvm::outs() << "After QCO-to-QC Conversion:\n"
diff --git a/mlir/unittests/Compiler/test_compiler_pipeline.cpp b/mlir/unittests/Compiler/test_compiler_pipeline.cpp
index ddc3e4ce4d..ca222a4eca 100644
--- a/mlir/unittests/Compiler/test_compiler_pipeline.cpp
+++ b/mlir/unittests/Compiler/test_compiler_pipeline.cpp
@@ -15,6 +15,9 @@
 #include "mlir/Dialect/QC/IR/QCDialect.h"
 #include "mlir/Dialect/QC/Translation/TranslateQuantumComputationToQC.h"
 #include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
 #include "mlir/Dialect/QIR/Builder/QIRProgramBuilder.h"
 #include "mlir/Dialect/QTensor/IR/QTensorDialect.h"
 #include "mlir/Support/IRVerification.h"
@@ -24,6 +27,9 @@
 #include "quantum_computation_programs.h"
 
 #include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
 #include <mlir/Dialect/Arith/IR/Arith.h>
 #include <mlir/Dialect/ControlFlow/IR/ControlFlow.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
@@ -34,12 +40,16 @@
 #include <mlir/IR/DialectRegistry.h>
 #include <mlir/IR/MLIRContext.h>
 #include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
 #include <mlir/IR/Verifier.h>
 #include <mlir/Parser/Parser.h>
+#include <mlir/Support/LogicalResult.h>
 
+#include <cstddef>
 #include <cstdlib>
 #include <iosfwd>
 #include <memory>
+#include <optional>
 #include <string>
 
 namespace mqt::test::compiler {
@@ -692,4 +702,223 @@ INSTANTIATE_TEST_SUITE_P(
                                  nullptr, MQT_NAMED_BUILDER(mlir::qc::ctrlTwo),
                                  MQT_NAMED_BUILDER(mlir::qir::ctrlTwo)}));
 
+namespace {
+
+class CompilerPipelineNativeSynthesisConfigTest : public testing::Test {
+protected:
+  std::unique_ptr<mlir::MLIRContext> context;
+  mlir::OwningOpRef<mlir::ModuleOp> module;
+  mlir::QuantumCompilerConfig config;
+
+  void SetUp() override {
+    mlir::DialectRegistry registry;
+    registry.insert<mlir::qc::QCDialect, mlir::qco::QCODialect,
+                    mlir::qtensor::QTensorDialect, mlir::arith::ArithDialect,
+                    mlir::cf::ControlFlowDialect, mlir::func::FuncDialect,
+                    mlir::memref::MemRefDialect, mlir::scf::SCFDialect,
+                    mlir::LLVM::LLVMDialect>();
+    context = std::make_unique<mlir::MLIRContext>();
+    context->appendDialectRegistry(registry);
+    context->loadAllAvailableDialects();
+
+    module = mlir::qc::QCProgramBuilder::build(context.get(),
+                                               mlir::qc::staticQubitsWithOps);
+    ASSERT_TRUE(module);
+
+    config.recordIntermediates = true;
+  }
+
+  [[nodiscard]] mlir::CompilationRecord runPipelineAndExpectSuccess() const {
+    mlir::CompilationRecord record;
+    mlir::QuantumCompilerPipeline pipeline(config);
+    EXPECT_TRUE(pipeline.runPipeline(module.get(), &record).succeeded());
+    return record;
+  }
+
+  void runPipelineAndExpectFailure() const {
+    mlir::CompilationRecord record;
+    mlir::QuantumCompilerPipeline pipeline(config);
+    EXPECT_TRUE(mlir::failed(pipeline.runPipeline(module.get(), &record)));
+  }
+};
+
+} // namespace
+
+using mqt::test::isEquivalentUpToGlobalPhase;
+
+static std::optional<mlir::qco::Matrix4x4>
+computeStaticTwoQubitUnitary(mlir::ModuleOp module) {
+  if (module == nullptr) {
+    return std::nullopt;
+  }
+
+  mlir::qco::Matrix4x4 unitary = mlir::qco::Matrix4x4::identity();
+  llvm::DenseMap<mlir::Value, std::size_t> qubitIds;
+
+  const auto getQubitId = [&](mlir::Value qubit) -> std::optional<std::size_t> {
+    const auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  for (auto func : module.getOps<mlir::func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto staticOp = llvm::dyn_cast<mlir::qco::StaticOp>(&rawOp)) {
+          const auto index = static_cast<std::size_t>(staticOp.getIndex());
+          if (index >= 2) {
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(staticOp.getResult(), index);
+          continue;
+        }
+
+        if (llvm::isa<mlir::qco::BarrierOp, mlir::qco::GPhaseOp>(&rawOp)) {
+          continue;
+        }
+
+        auto op = llvm::dyn_cast<mlir::qco::UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue;
+        }
+
+        if (op.isSingleQubit()) {
+          const auto qid = getQubitId(op.getInputQubit(0));
+          if (!qid) {
+            return std::nullopt;
+          }
+          mlir::qco::Matrix2x2 oneQ;
+          if (!op.getUnitaryMatrix2x2(oneQ)) {
+            return std::nullopt;
+          }
+          unitary = mlir::qco::embedSingleQubitInTwoQubit(oneQ, *qid) * unitary;
+          qubitIds[op.getOutputQubit(0)] = *qid;
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0 = getQubitId(op.getInputQubit(0));
+          const auto q1 = getQubitId(op.getInputQubit(1));
+          if (!q0 || !q1) {
+            return std::nullopt;
+          }
+          mlir::qco::Matrix4x4 twoQ;
+          if (auto ctrl = llvm::dyn_cast<mlir::qco::CtrlOp>(&rawOp)) {
+            if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+              return std::nullopt;
+            }
+            auto* body = ctrl.getBodyUnitary(0).getOperation();
+            if (llvm::isa<mlir::qco::XOp>(body)) {
+              twoQ = mlir::qco::twoQubitControlledX01();
+            } else if (llvm::isa<mlir::qco::ZOp>(body)) {
+              twoQ = mlir::qco::twoQubitControlledZ();
+            } else {
+              return std::nullopt;
+            }
+          } else if (!op.getUnitaryMatrix4x4(twoQ)) {
+            return std::nullopt;
+          }
+          const llvm::SmallVector<std::size_t, 2> ids{*q0, *q1};
+          unitary =
+              mlir::qco::reorderTwoQubitMatrix(twoQ, ids[0], ids[1]) * unitary;
+          qubitIds[op.getOutputQubit(0)] = *q0;
+          qubitIds[op.getOutputQubit(1)] = *q1;
+          continue;
+        }
+
+        return std::nullopt;
+      }
+    }
+  }
+
+  return unitary;
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredNativeSynthesisProfileInStage5) {
+  config.nativeGates = "x,sx,rz,cx";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  EXPECT_NE(record.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(record.afterOptimization.find("qco.h"), std::string::npos);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredU3CxNativeSynthesisProfileInStage5) {
+  config.nativeGates = "u,cx";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  EXPECT_NE(record.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(record.afterOptimization.find("qco.h"), std::string::npos);
+  EXPECT_NE(record.afterOptimization.find("qco.u"), std::string::npos);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredExpandedNativeSynthesisProfileInStage5) {
+  config.nativeGates = "u,rx,rz,cx,cz";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  EXPECT_NE(record.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(record.afterOptimization.find("qco.h"), std::string::npos);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       RejectsUnderSpecifiedNativeSynthesisMenuInStage5) {
+  config.nativeGates = "cx,cz";
+
+  runPipelineAndExpectFailure();
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       RejectsInvalidNativeGateTokenInStage5) {
+  config.nativeGates = "not-a-gate";
+
+  runPipelineAndExpectFailure();
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       LeavesIRUnchangedWhenNoNativeProfileIsConfigured) {
+  config.nativeGates = "";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  EXPECT_NE(record.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(record.afterQCOCanon, record.afterOptimization);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       LeavesIRUnchangedWhenNativeGatesIsWhitespaceOnly) {
+  config.nativeGates = "   \t  ";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  EXPECT_NE(record.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(record.afterQCOCanon, record.afterOptimization);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       NativeSynthesisPreservesUnitaryOnStaticQubits) {
+  config.nativeGates = "x,sx,rz,cx";
+
+  const auto record = runPipelineAndExpectSuccess();
+
+  auto preSynth = mlir::parseSourceString<mlir::ModuleOp>(record.afterQCOCanon,
+                                                          context.get());
+  auto postSynth = mlir::parseSourceString<mlir::ModuleOp>(
+      record.afterOptimization, context.get());
+  ASSERT_TRUE(preSynth);
+  ASSERT_TRUE(postSynth);
+
+  const auto preU = computeStaticTwoQubitUnitary(preSynth.get());
+  const auto postU = computeStaticTwoQubitUnitary(postSynth.get());
+  ASSERT_TRUE(preU);
+  ASSERT_TRUE(postU);
+  EXPECT_TRUE(isEquivalentUpToGlobalPhase(*preU, *postU));
+}
+
 } // namespace mqt::test::compiler
diff --git a/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
index d59780f461..163412b775 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
+++ b/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
@@ -8,4 +8,5 @@
 
 add_subdirectory(Decomposition)
 add_subdirectory(Mapping)
+add_subdirectory(NativeSynthesis)
 add_subdirectory(Optimizations)
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
index f493bb9e4d..c6160821cd 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
@@ -7,12 +7,20 @@
 # Licensed under the MIT License
 
 set(target_name mqt-core-mlir-unittest-decomposition)
-add_executable(${target_name} test_euler_decomposition.cpp)
+add_executable(${target_name} test_euler_decomposition.cpp test_weyl_decomposition.cpp)
 
-target_link_libraries(${target_name} PRIVATE GTest::gtest_main MLIRQCOProgramBuilder
-                                             MLIRQCOTransforms)
-target_link_libraries(${target_name} PRIVATE MLIRPass MLIRFuncDialect MLIRArithDialect MLIRIR
-                                             MLIRSupport MLIRQTensorDialect)
+target_link_libraries(
+  ${target_name}
+  PRIVATE GTest::gtest_main
+          MLIRQCOProgramBuilder
+          MLIRQCOTransforms
+          MLIRQCOUtils
+          MLIRPass
+          MLIRFuncDialect
+          MLIRArithDialect
+          MLIRIR
+          MLIRSupport
+          MLIRQTensorDialect)
 
 mqt_mlir_configure_unittest_target(${target_name})
 
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
index fa9ba1d4ea..8cc15b55f3 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
@@ -37,7 +37,6 @@
 #include <mlir/Support/WalkResult.h>
 
 #include <array>
-#include <cmath>
 #include <complex>
 #include <cstddef>
 #include <functional>
@@ -104,18 +103,6 @@ class EulerSynthesisExactTest
 // Euler synthesis support
 //===----------------------------------------------------------------------===//
 
-[[nodiscard]] static Matrix2x2 rzMatrix(const double theta) {
-  const auto m00 = std::polar(1.0, -theta / 2.0);
-  const auto m11 = std::polar(1.0, theta / 2.0);
-  return Matrix2x2::fromElements(m00, 0, 0, m11);
-}
-
-[[nodiscard]] static Matrix2x2 ryMatrix(const double theta) {
-  const auto m00 = std::cos(theta / 2.0);
-  const auto m01 = -std::sin(theta / 2.0);
-  return Matrix2x2::fromElements(m00, m01, -m01, m00);
-}
-
 [[nodiscard]] static Matrix2x2 randomUnitaryMatrix(std::mt19937& rng) {
   std::uniform_real_distribution dist(-std::numbers::pi, std::numbers::pi);
   const Matrix2x2 su2 =
@@ -264,6 +251,8 @@ template <typename OpTy>
     return countOps<UOp>(funcOp);
   case ZSXX:
     return countZSXXGates(funcOp);
+  case R:
+    return countOps<ROp>(funcOp);
   }
   return 0;
 }
@@ -472,6 +461,8 @@ TEST(EulerSynthesisTest, RandomReconstructionAllBases) {
     return isa<UOp>(op);
   case ZSXX:
     return isa<RZOp, SXOp, XOp>(op);
+  case R:
+    return isa<ROp>(op);
   }
   return false;
 }
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp
new file mode 100644
index 0000000000..41e82a2726
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "TestCaseUtils.h"
+#include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/NativeProfile.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Weyl.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/DialectRegistry.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
+#include <mlir/IR/Verifier.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <random>
+#include <tuple>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::qco;
+using namespace mlir::qco::decomposition;
+using namespace mqt::test;
+
+using QubitId = std::size_t;
+
+static constexpr double K_SANITY_CHECK_PRECISION = 1e-12;
+
+static const Matrix2x2& hGate() {
+  static const Matrix2x2 MATRIX = Matrix2x2::fromElements(
+      FRAC1_SQRT2, FRAC1_SQRT2, FRAC1_SQRT2, -FRAC1_SQRT2);
+  return MATRIX;
+}
+
+static Matrix4x4 randomUnitary4x4(std::mt19937& rng) {
+  std::normal_distribution<double> normalDist(0.0, 1.0);
+  std::vector<std::vector<std::complex<double>>> columns(
+      4, std::vector<std::complex<double>>(4));
+  for (auto& column : columns) {
+    for (auto& entry : column) {
+      entry = std::complex<double>(normalDist(rng), normalDist(rng));
+    }
+  }
+  for (std::size_t j = 0; j < 4; ++j) {
+    for (std::size_t k = 0; k < j; ++k) {
+      std::complex<double> projection{0.0, 0.0};
+      for (std::size_t i = 0; i < 4; ++i) {
+        projection += std::conj(columns[k][i]) * columns[j][i];
+      }
+      for (std::size_t i = 0; i < 4; ++i) {
+        columns[j][i] -= projection * columns[k][i];
+      }
+    }
+    double norm = 0.0;
+    for (std::size_t i = 0; i < 4; ++i) {
+      norm += std::norm(columns[j][i]);
+    }
+    norm = std::sqrt(norm);
+    for (std::size_t i = 0; i < 4; ++i) {
+      columns[j][i] /= norm;
+    }
+  }
+  const Matrix4x4 unitary = Matrix4x4::fromElements(
+      columns[0][0], columns[1][0], columns[2][0], columns[3][0], columns[0][1],
+      columns[1][1], columns[2][1], columns[3][1], columns[0][2], columns[1][2],
+      columns[2][2], columns[3][2], columns[0][3], columns[1][3], columns[2][3],
+      columns[3][3]);
+  assert((unitary.adjoint() * unitary).isIdentity(1e-12));
+  return unitary;
+}
+
+static Matrix4x4 restoreWeyl(const TwoQubitWeylDecomposition& decomposition) {
+  return kron(decomposition.k1l(), decomposition.k1r()) *
+         decomposition.getCanonicalMatrix() *
+         kron(decomposition.k2l(), decomposition.k2r()) *
+         globalPhaseFactor(decomposition.globalPhase());
+}
+
+static Matrix4x4 restoreBasis(const TwoQubitNativeDecomposition& decomposition,
+                              const Matrix4x4& entangler) {
+  const auto& factors = decomposition.singleQubitFactors;
+  const auto layer = [&](std::size_t i) {
+    return kron(factors[(2 * i) + 1], factors[2 * i]);
+  };
+  Matrix4x4 matrix = layer(0);
+  for (std::uint8_t i = 0; i < decomposition.numBasisUses; ++i) {
+    matrix = entangler * matrix;
+    matrix = layer(static_cast<std::size_t>(i) + 1) * matrix;
+  }
+  return matrix * globalPhaseFactor(decomposition.globalPhase);
+}
+
+static auto productMatrixCases() {
+  return ::testing::Values(
+      []() -> Matrix4x4 { return Matrix4x4::identity(); },
+      []() -> Matrix4x4 { return kron(rzMatrix(1.0), ryMatrix(3.1)); },
+      []() -> Matrix4x4 { return kron(Matrix2x2::identity(), rxMatrix(0.1)); });
+}
+
+static auto entangledMatrixCases() {
+  return ::testing::Values(
+      []() -> Matrix4x4 { return rzzMatrix(2.0); },
+      []() -> Matrix4x4 {
+        return ryyMatrix(1.0) * rzzMatrix(3.0) * rxxMatrix(2.0);
+      },
+      []() -> Matrix4x4 {
+        return TwoQubitWeylDecomposition::getCanonicalMatrix(1.5, -0.2, 0.0) *
+               kron(rxMatrix(1.0), Matrix2x2::identity());
+      },
+      []() -> Matrix4x4 {
+        return kron(rxMatrix(1.0), ryMatrix(1.0)) *
+               TwoQubitWeylDecomposition::getCanonicalMatrix(1.1, 0.2, 3.0) *
+               kron(rxMatrix(1.0), Matrix2x2::identity());
+      },
+      []() -> Matrix4x4 {
+        return kron(hGate(), iPauliZ()) * twoQubitControlledX01() *
+               kron(iPauliX(), iPauliY());
+      });
+}
+
+static auto cxBasisCases() {
+  return ::testing::Values(
+      []() -> Matrix4x4 { return twoQubitControlledX01(); },
+      []() -> Matrix4x4 { return twoQubitControlledX10(); });
+}
+
+static bool extractSingleQubitMatrix(UnitaryOpInterface op, Matrix2x2& out) {
+  if (op.getUnitaryMatrix2x2(out)) {
+    return true;
+  }
+  DynamicMatrix dynamic;
+  if (!op.getUnitaryMatrixDynamic(dynamic) || dynamic.rows() != 2 ||
+      dynamic.cols() != 2) {
+    return false;
+  }
+  out = Matrix2x2::fromElements(dynamic(0, 0), dynamic(0, 1), dynamic(1, 0),
+                                dynamic(1, 1));
+  return true;
+}
+
+static bool extractTwoQubitMatrix(UnitaryOpInterface op, Matrix4x4& out) {
+  if (auto ctrl = llvm::dyn_cast<CtrlOp>(op.getOperation())) {
+    if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+      return false;
+    }
+    Operation* body = ctrl.getBodyUnitary(0).getOperation();
+    if (llvm::isa<XOp>(body)) {
+      out = twoQubitControlledX01();
+      return true;
+    }
+    if (llvm::isa<ZOp>(body)) {
+      out = twoQubitControlledZ();
+      return true;
+    }
+    return false;
+  }
+  return op.getUnitaryMatrix4x4(out);
+}
+
+static std::optional<Matrix4x4>
+computeTwoQubitUnitaryFromFunc(func::FuncOp funcOp) {
+  Matrix4x4 unitary = Matrix4x4::identity();
+  std::complex<double> global{1.0, 0.0};
+  llvm::DenseMap<Value, QubitId> wireIds;
+  wireIds[funcOp.getArgument(0)] = 0;
+  wireIds[funcOp.getArgument(1)] = 1;
+
+  auto wireId = [&](Value qubit) -> std::optional<QubitId> {
+    const auto it = wireIds.find(qubit);
+    if (it == wireIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  for (Operation& rawOp : funcOp.getBody().front()) {
+    if (llvm::isa<func::ReturnOp>(&rawOp)) {
+      continue;
+    }
+    if (auto gphase = llvm::dyn_cast<GPhaseOp>(&rawOp)) {
+      if (const auto matrix = gphase.getUnitaryMatrix()) {
+        global *= (*matrix)(0, 0);
+      }
+      continue;
+    }
+    auto op = llvm::dyn_cast<UnitaryOpInterface>(&rawOp);
+    if (!op) {
+      continue;
+    }
+
+    if (op.isSingleQubit()) {
+      const auto qIn = op->getOperand(0);
+      const auto qid = wireId(qIn);
+      if (!qid) {
+        return std::nullopt;
+      }
+      Matrix2x2 oneQ;
+      if (!extractSingleQubitMatrix(op, oneQ)) {
+        return std::nullopt;
+      }
+      unitary = embedSingleQubitInTwoQubit(oneQ, *qid) * unitary;
+      wireIds[op->getResult(0)] = *qid;
+      continue;
+    }
+
+    if (op.isTwoQubit()) {
+      const auto q0In = op->getOperand(0);
+      const auto q1In = op->getOperand(1);
+      const auto q0id = wireId(q0In);
+      const auto q1id = wireId(q1In);
+      if (!q0id || !q1id) {
+        return std::nullopt;
+      }
+      Matrix4x4 twoQ;
+      if (!extractTwoQubitMatrix(op, twoQ)) {
+        return std::nullopt;
+      }
+      const llvm::SmallVector<QubitId, 2> ids{*q0id, *q1id};
+      unitary = reorderTwoQubitMatrix(twoQ, ids[0], ids[1]) * unitary;
+      wireIds[op->getResult(0)] = *q0id;
+      wireIds[op->getResult(1)] = *q1id;
+    }
+  }
+
+  return unitary * global;
+}
+
+static func::FuncOp synthesize2QIntoFunc(MLIRContext* ctx,
+                                         const Matrix4x4& target,
+                                         const NativeProfileSpec& spec,
+                                         OwningOpRef<ModuleOp>& moduleOut) {
+  moduleOut = ModuleOp::create(UnknownLoc::get(ctx));
+  OpBuilder builder(ctx);
+  builder.setInsertionPointToStart(moduleOut->getBody());
+
+  const auto qubitTy = QubitType::get(ctx);
+  const auto funcTy =
+      builder.getFunctionType({qubitTy, qubitTy}, {qubitTy, qubitTy});
+  const Location loc = moduleOut->getLoc();
+  auto func = func::FuncOp::create(builder, loc, "main", funcTy);
+  auto* entry = func.addEntryBlock();
+
+  builder.setInsertionPointToStart(entry);
+  Value out0;
+  Value out1;
+  if (failed(synthesizeUnitary2QWeyl(builder, loc, entry->getArgument(0),
+                                     entry->getArgument(1), target, spec, out0,
+                                     out1))) {
+    llvm::report_fatal_error(
+        "synthesizeUnitary2QWeyl failed during test synthesis");
+  }
+  func::ReturnOp::create(builder, loc, ValueRange{out0, out1});
+  return func;
+}
+
+static void expectSynthesized2QMatrix(MLIRContext* ctx, const Matrix4x4& target,
+                                      const NativeProfileSpec& spec) {
+  OwningOpRef<ModuleOp> module;
+  const auto func = synthesize2QIntoFunc(ctx, target, spec, module);
+  ASSERT_TRUE(succeeded(verify(module.get())));
+  const auto actual = computeTwoQubitUnitaryFromFunc(func);
+  ASSERT_TRUE(actual.has_value());
+  EXPECT_TRUE(isEquivalentUpToGlobalPhase(*actual, target));
+}
+
+TEST(DecompositionHelpersTest, MatrixUtilitySanity) {
+  EXPECT_DOUBLE_EQ(remEuclid(-1.0, 3.0), 2.0);
+  EXPECT_DOUBLE_EQ(traceToFidelity(std::complex<double>{3.0, 4.0}),
+                   (4.0 + 25.0) / 20.0);
+  EXPECT_NEAR(std::abs(globalPhaseFactor(1.25)), 1.0, 1e-14);
+  EXPECT_FALSE(isUnitaryMatrix(Matrix2x2::fromElements(2.0, 0.0, 0.0, 2.0)));
+  EXPECT_TRUE(isUnitaryMatrix(Matrix2x2::identity()));
+}
+
+namespace {
+
+struct MlirTestContext {
+  std::unique_ptr<MLIRContext> context;
+
+  void setUp() {
+    DialectRegistry registry;
+    registry.insert<QCODialect, arith::ArithDialect, func::FuncDialect>();
+    context = std::make_unique<MLIRContext>();
+    context->appendDialectRegistry(registry);
+    context->loadAllAvailableDialects();
+  }
+
+  [[nodiscard]] MLIRContext* ctx() const { return context.get(); }
+};
+
+class WeylDecompositionTest : public testing::TestWithParam<Matrix4x4 (*)()> {};
+
+class BasisDecomposerTest : public testing::TestWithParam<
+                                std::tuple<Matrix4x4 (*)(), Matrix4x4 (*)()>> {
+protected:
+  void SetUp() override {
+    basisMatrix = std::get<0>(GetParam())();
+    target = std::get<1>(GetParam())();
+    targetDecomposition = std::make_unique<TwoQubitWeylDecomposition>(
+        TwoQubitWeylDecomposition::create(target, 1.0));
+  }
+
+  Matrix4x4 target;
+  Matrix4x4 basisMatrix;
+  std::unique_ptr<TwoQubitWeylDecomposition> targetDecomposition;
+};
+
+struct WeylSynthesisCase {
+  const char* name;
+  const char* nativeGates;
+  Matrix4x4 (*target)();
+};
+
+class WeylSynthesisTest : public testing::TestWithParam<WeylSynthesisCase> {};
+
+} // namespace
+
+TEST_P(WeylDecompositionTest, ReconstructsWithinRequestedFidelity) {
+  const Matrix4x4 originalMatrix = GetParam()();
+  for (const double fidelity : {1.0, 1.0 - 1e-12}) {
+    const auto decomposition =
+        TwoQubitWeylDecomposition::create(originalMatrix, fidelity);
+    EXPECT_TRUE(restoreWeyl(decomposition).isApprox(originalMatrix));
+  }
+}
+
+TEST(WeylDecompositionStandalone,
+     CnotProducesValidWeylParametersAndUnitaryLocals) {
+  const Matrix4x4 cnot =
+      Matrix4x4::fromElements(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0);
+  const auto decomp = TwoQubitWeylDecomposition::create(cnot, std::nullopt);
+  constexpr double piOver4 = 0.7853981633974483;
+  for (const double angle : {decomp.a(), decomp.b(), decomp.c()}) {
+    EXPECT_GE(angle, -1e-10);
+    EXPECT_LE(angle, piOver4 + 1e-10);
+  }
+  EXPECT_TRUE(isUnitaryMatrix(decomp.k1l()));
+  EXPECT_TRUE(isUnitaryMatrix(decomp.k2l()));
+  EXPECT_TRUE(isUnitaryMatrix(decomp.k1r()));
+  EXPECT_TRUE(isUnitaryMatrix(decomp.k2r()));
+}
+
+TEST(WeylDecompositionStandalone, Random) {
+  std::mt19937 rng{1234567UL};
+  for (int i = 0; i < 5000; ++i) {
+    const Matrix4x4 originalMatrix = randomUnitary4x4(rng);
+    const auto decomposition = TwoQubitWeylDecomposition::create(
+        originalMatrix, std::optional<double>{1.0 - 1e-12});
+    EXPECT_TRUE(restoreWeyl(decomposition)
+                    .isApprox(originalMatrix, K_SANITY_CHECK_PRECISION));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(ProductTwoQubitMatrices, WeylDecompositionTest,
+                         productMatrixCases());
+INSTANTIATE_TEST_SUITE_P(TwoQubitMatrices, WeylDecompositionTest,
+                         entangledMatrixCases());
+
+TEST_P(BasisDecomposerTest, ReconstructsWithinRequestedFidelity) {
+  for (const double fidelity : {1.0, 1.0 - 1e-12}) {
+    const auto decomposer =
+        TwoQubitBasisDecomposer::create(basisMatrix, fidelity);
+    const auto decomposed =
+        decomposer.twoQubitDecompose(*targetDecomposition, std::nullopt);
+    ASSERT_TRUE(decomposed.has_value());
+    EXPECT_TRUE(restoreBasis(*decomposed, basisMatrix).isApprox(target));
+  }
+}
+
+TEST(BasisDecomposerTest, Random) {
+  std::mt19937 rng{123456UL};
+  const llvm::SmallVector<Matrix4x4, 2> basisMatrices{twoQubitControlledX01(),
+                                                      twoQubitControlledX10()};
+  std::uniform_int_distribution<std::size_t> distBasisGate{0, 1};
+
+  for (int i = 0; i < 2000; ++i) {
+    const Matrix4x4 originalMatrix = randomUnitary4x4(rng);
+    const auto targetDecomposition = TwoQubitWeylDecomposition::create(
+        originalMatrix, std::optional<double>{1.0});
+    const Matrix4x4 basisMatrix = basisMatrices[distBasisGate(rng)];
+    const auto decomposer = TwoQubitBasisDecomposer::create(basisMatrix, 1.0);
+    const auto decomposed =
+        decomposer.twoQubitDecompose(targetDecomposition, std::nullopt);
+    ASSERT_TRUE(decomposed.has_value());
+    EXPECT_TRUE(restoreBasis(*decomposed, basisMatrix)
+                    .isApprox(originalMatrix, K_SANITY_CHECK_PRECISION));
+  }
+}
+
+TEST(BasisDecomposerNumBasisTest, ForcesZeroBasisUsesForIdentityTarget) {
+  const Matrix4x4 basis = twoQubitControlledX01();
+  const auto decomposer = TwoQubitBasisDecomposer::create(basis, 1.0);
+  const Matrix4x4 target = Matrix4x4::identity();
+  const auto weyl = TwoQubitWeylDecomposition::create(target, 1.0);
+  const auto decomposed = decomposer.twoQubitDecompose(weyl, std::uint8_t{0});
+  ASSERT_TRUE(decomposed.has_value());
+  EXPECT_EQ(decomposed->numBasisUses, 0);
+  EXPECT_TRUE(restoreBasis(*decomposed, basis).isApprox(target));
+}
+
+INSTANTIATE_TEST_SUITE_P(ProductTwoQubitMatrices, BasisDecomposerTest,
+                         testing::Combine(cxBasisCases(),
+                                          productMatrixCases()));
+INSTANTIATE_TEST_SUITE_P(TwoQubitMatrices, BasisDecomposerTest,
+                         testing::Combine(cxBasisCases(),
+                                          entangledMatrixCases()));
+
+TEST_P(WeylSynthesisTest, PreservesTargetUnitary) {
+  MlirTestContext fx;
+  fx.setUp();
+  const auto spec = parseNativeSpec(GetParam().nativeGates);
+  ASSERT_TRUE(spec);
+  expectSynthesized2QMatrix(fx.ctx(), GetParam().target(), *spec);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    Profiles, WeylSynthesisTest,
+    testing::Values(
+        WeylSynthesisCase{"CxGeneric", "u,cx",
+                          [] { return twoQubitControlledX01(); }},
+        WeylSynthesisCase{"ProductGeneric", "u,cx",
+                          [] { return kron(rzMatrix(1.0), ryMatrix(0.3)); }},
+        WeylSynthesisCase{"IbmBasic", "x,sx,rz,cx",
+                          [] {
+                            return kron(hGate(), Matrix2x2::identity()) *
+                                   twoQubitControlledX01() *
+                                   kron(rzMatrix(0.2), ryMatrix(0.1));
+                          }}),
+    [](const testing::TestParamInfo<WeylSynthesisCase>& info) {
+      return info.param.name;
+    });
+
+TEST(WeylSynthesisTest, IdentityRequiresNoEntanglers) {
+  const auto spec = parseNativeSpec("u,cx");
+  ASSERT_TRUE(spec);
+  const auto count = twoQubitEntanglerCount(Matrix4x4::identity(), *spec);
+  ASSERT_TRUE(count.has_value());
+  EXPECT_EQ(*count, 0U);
+}
+
+TEST(WeylSynthesisTest, FailsWithoutEntanglerInSpec) {
+  MlirTestContext fx;
+  fx.setUp();
+  const auto spec = parseNativeSpec("u");
+  ASSERT_TRUE(spec);
+  EXPECT_TRUE(spec->entanglerBases.empty());
+  OpBuilder builder(fx.ctx());
+  const auto qubitTy = QubitType::get(fx.ctx());
+  const auto funcTy =
+      builder.getFunctionType({qubitTy, qubitTy}, {qubitTy, qubitTy});
+  auto func =
+      func::FuncOp::create(builder, UnknownLoc::get(fx.ctx()), "main", funcTy);
+  auto* entry = func.addEntryBlock();
+  builder.setInsertionPointToStart(entry);
+  Value out0;
+  Value out1;
+  EXPECT_TRUE(failed(synthesizeUnitary2QWeyl(
+      builder, func.getLoc(), entry->getArgument(0), entry->getArgument(1),
+      twoQubitControlledX01(), *spec, out0, out1)));
+}
+
+TEST(NativeSpecTest, ParsesAndRejectsMenus) {
+  const auto ibm = parseNativeSpec("x,sx,rz,cx");
+  ASSERT_TRUE(ibm);
+  EXPECT_TRUE(ibm->allowedGates.contains(NativeGateKind::Cx));
+  EXPECT_TRUE(ibm->allowedGates.contains(NativeGateKind::X));
+  EXPECT_FALSE(ibm->allowRzz);
+  EXPECT_FALSE(parseNativeSpec("x,sx,rz,not-a-gate").has_value());
+
+  const auto pMenu = parseNativeSpec("x,sx,p,cx");
+  const auto rzMenu = parseNativeSpec("x,sx,rz,cx");
+  ASSERT_TRUE(pMenu);
+  ASSERT_TRUE(rzMenu);
+  EXPECT_EQ(pMenu->allowedGates, rzMenu->allowedGates);
+
+  const auto cxOnly = parseNativeSpec("u,cx");
+  ASSERT_TRUE(cxOnly);
+  EXPECT_TRUE(llvm::is_contained(cxOnly->entanglerBases, EntanglerBasis::Cx));
+  EXPECT_FALSE(llvm::is_contained(cxOnly->entanglerBases, EntanglerBasis::Cz));
+
+  const auto both = parseNativeSpec("u,cx,cz");
+  ASSERT_TRUE(both);
+  EXPECT_TRUE(llvm::is_contained(both->entanglerBases, EntanglerBasis::Cx));
+  EXPECT_TRUE(llvm::is_contained(both->entanglerBases, EntanglerBasis::Cz));
+
+  const auto generic = parseNativeSpec("u,cx");
+  ASSERT_TRUE(generic);
+  EXPECT_TRUE(generic->allowedGates.contains(NativeGateKind::U));
+  EXPECT_FALSE(generic->allowedGates.contains(NativeGateKind::X));
+}
diff --git a/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt
new file mode 100644
index 0000000000..0fa87a5daf
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+# Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+# All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Licensed under the MIT License
+
+set(target_name mqt-core-mlir-unittest-fuse-two-qubit-unitary-runs)
+add_executable(${target_name} test_fuse_two_qubit_unitary_runs.cpp)
+
+target_link_libraries(
+  ${target_name}
+  PRIVATE MLIRParser
+          GTest::gtest_main
+          MLIRQCPrograms
+          MLIRQCOProgramBuilder
+          MLIRQCOUtils
+          MLIRQCToQCO
+          MLIRQCOTransforms
+          MLIRPass
+          MLIRFuncDialect
+          MLIRArithDialect
+          MLIRIR
+          MLIRSupport
+          LLVMSupport)
+
+mqt_mlir_configure_unittest_target(${target_name})
+
+gtest_discover_tests(${target_name} PROPERTIES LABELS mqt-mlir-unittests DISCOVERY_TIMEOUT 60)
diff --git a/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_fuse_two_qubit_unitary_runs.cpp b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_fuse_two_qubit_unitary_runs.cpp
new file mode 100644
index 0000000000..ddaa13e035
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_fuse_two_qubit_unitary_runs.cpp
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "TestCaseUtils.h"
+#include "mlir/Conversion/QCToQCO/QCToQCO.h"
+#include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+#include "qc_programs.h"
+
+#include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/raw_ostream.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Dialect/QC/Builder/QCProgramBuilder.h>
+#include <mlir/Dialect/QC/IR/QCDialect.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/DialectRegistry.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
+#include <mlir/IR/Verifier.h>
+#include <mlir/Pass/PassManager.h>
+#include <mlir/Support/LLVM.h>
+#include <mlir/Support/LogicalResult.h>
+#include <mlir/Support/WalkResult.h>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <tuple>
+
+using namespace mlir;
+using namespace mlir::qco;
+using namespace mqt::test;
+
+using ProgramFn = void (*)(mlir::qc::QCProgramBuilder&);
+using NativePredicate = bool (*)(OwningOpRef<ModuleOp>&);
+
+template <typename... Allowed1QOps>
+static bool onlyTheseOps(OwningOpRef<ModuleOp>& moduleOp, bool allowCx,
+                         bool allowCz) {
+  bool ok = true;
+  std::ignore = moduleOp->walk([&](UnitaryOpInterface op) {
+    Operation* raw = op.getOperation();
+    if (llvm::isa_and_present<CtrlOp>(raw->getParentOp())) {
+      return WalkResult::advance();
+    }
+    if (llvm::isa<BarrierOp, GPhaseOp>(raw)) {
+      return WalkResult::advance();
+    }
+    if (auto ctrl = llvm::dyn_cast<CtrlOp>(raw)) {
+      if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+        ok = false;
+        return WalkResult::interrupt();
+      }
+      Operation* body = ctrl.getBodyUnitary(0).getOperation();
+      const bool isCx = llvm::isa<XOp>(body);
+      const bool isCz = llvm::isa<ZOp>(body);
+      if ((isCx && allowCx) || (isCz && allowCz)) {
+        return WalkResult::advance();
+      }
+      ok = false;
+      return WalkResult::interrupt();
+    }
+    if (!llvm::isa<Allowed1QOps...>(raw)) {
+      ok = false;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return ok;
+}
+
+static bool onlyIbmBasicCxOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<XOp, SXOp, RZOp, POp>(m, true, false);
+}
+static bool onlyIbmBasicCzOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<XOp, SXOp, RZOp, POp>(m, false, true);
+}
+static bool onlyGenericU3CxOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<UOp>(m, true, false);
+}
+static bool onlyIqmDefaultOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<ROp>(m, false, true);
+}
+static bool onlyIbmFractionalOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<XOp, SXOp, RZOp, POp, RXOp, RZZOp>(m, false, true);
+}
+static bool onlyAxisPairRxRzCxOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<RXOp, RZOp, POp>(m, true, false);
+}
+static bool onlyAxisPairRxRyCxOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<RXOp, RYOp>(m, true, false);
+}
+static bool onlyAxisPairRyRzCzOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<RYOp, RZOp, POp>(m, false, true);
+}
+static bool onlyGenericU3CxOrCzOps(OwningOpRef<ModuleOp>& m) {
+  return onlyTheseOps<UOp>(m, true, true);
+}
+
+static std::optional<Value> unitaryQubitOperand(UnitaryOpInterface op,
+                                                std::size_t index) {
+  if (index >= op.getNumQubits()) {
+    return std::nullopt;
+  }
+  Value v = op->getOperand(index);
+  if (!llvm::isa<QubitType>(v.getType())) {
+    return std::nullopt;
+  }
+  return v;
+}
+
+static std::optional<Value> unitaryQubitResult(UnitaryOpInterface op,
+                                               std::size_t index) {
+  if (index >= op.getNumQubits()) {
+    return std::nullopt;
+  }
+  Value v = op->getResult(index);
+  if (!llvm::isa<QubitType>(v.getType())) {
+    return std::nullopt;
+  }
+  return v;
+}
+
+static bool extractSingleQubitMatrix(UnitaryOpInterface op, Matrix2x2& out) {
+  if (op.getUnitaryMatrix2x2(out)) {
+    return true;
+  }
+  DynamicMatrix dynamic;
+  if (!op.getUnitaryMatrixDynamic(dynamic) || dynamic.rows() != 2 ||
+      dynamic.cols() != 2) {
+    return false;
+  }
+  out = Matrix2x2::fromElements(dynamic(0, 0), dynamic(0, 1), dynamic(1, 0),
+                                dynamic(1, 1));
+  return true;
+}
+
+static bool extractTwoQubitMatrix(UnitaryOpInterface op, Matrix4x4& out) {
+  if (auto ctrl = llvm::dyn_cast<CtrlOp>(op.getOperation())) {
+    if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+      return false;
+    }
+    Operation* body = ctrl.getBodyUnitary(0).getOperation();
+    if (llvm::isa<XOp>(body)) {
+      out = twoQubitControlledX01();
+      return true;
+    }
+    if (llvm::isa<ZOp>(body)) {
+      out = twoQubitControlledZ();
+      return true;
+    }
+    return false;
+  }
+  return op.getUnitaryMatrix4x4(out);
+}
+
+static std::optional<DynamicMatrix>
+computeUnitaryFromQcoModule(const OwningOpRef<ModuleOp>& moduleOp) {
+  ModuleOp module = moduleOp.get();
+  if (!module) {
+    return std::nullopt;
+  }
+
+  llvm::DenseMap<Value, std::size_t> qubitIds;
+  std::size_t nextQubitId = 0;
+  std::size_t numQubits = 0;
+
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto staticOp = llvm::dyn_cast<StaticOp>(&rawOp)) {
+          const auto index = static_cast<std::size_t>(staticOp.getIndex());
+          qubitIds.try_emplace(staticOp.getQubit(), index);
+          numQubits = std::max(numQubits, index + 1);
+        } else if (auto alloc = llvm::dyn_cast<AllocOp>(&rawOp)) {
+          qubitIds.try_emplace(alloc.getResult(), nextQubitId++);
+          numQubits = std::max(numQubits, nextQubitId);
+        }
+      }
+    }
+  }
+
+  if (numQubits == 0) {
+    return std::nullopt;
+  }
+
+  DynamicMatrix unitary =
+      DynamicMatrix::identity(static_cast<std::int64_t>(1ULL << numQubits));
+
+  auto getQubitId = [&](Value qubit) -> std::optional<std::size_t> {
+    const auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        auto op = llvm::dyn_cast<UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue;
+        }
+        if (llvm::isa<BarrierOp, GPhaseOp>(op.getOperation())) {
+          continue;
+        }
+
+        if (op.isSingleQubit()) {
+          const auto qIn = unitaryQubitOperand(op, 0);
+          if (!qIn) {
+            return std::nullopt;
+          }
+          const auto qid = getQubitId(*qIn);
+          if (!qid) {
+            return std::nullopt;
+          }
+          Matrix2x2 oneQ;
+          if (!extractSingleQubitMatrix(op, oneQ)) {
+            return std::nullopt;
+          }
+          unitary = embedSingleQubitInNqubit(oneQ, numQubits, *qid) * unitary;
+          const auto qOut = unitaryQubitResult(op, 0);
+          if (!qOut) {
+            return std::nullopt;
+          }
+          qubitIds[*qOut] = *qid;
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0In = unitaryQubitOperand(op, 0);
+          const auto q1In = unitaryQubitOperand(op, 1);
+          if (!q0In || !q1In) {
+            return std::nullopt;
+          }
+          const auto q0id = getQubitId(*q0In);
+          const auto q1id = getQubitId(*q1In);
+          if (!q0id || !q1id) {
+            return std::nullopt;
+          }
+          Matrix4x4 twoQ;
+          if (!extractTwoQubitMatrix(op, twoQ)) {
+            return std::nullopt;
+          }
+          unitary =
+              embedTwoQubitInNqubit(twoQ, numQubits, *q0id, *q1id) * unitary;
+          const auto q0Out = unitaryQubitResult(op, 0);
+          const auto q1Out = unitaryQubitResult(op, 1);
+          if (!q0Out || !q1Out) {
+            return std::nullopt;
+          }
+          qubitIds[*q0Out] = *q0id;
+          qubitIds[*q1Out] = *q1id;
+          continue;
+        }
+
+        return std::nullopt;
+      }
+    }
+  }
+
+  return unitary;
+}
+
+namespace {
+
+struct ProfileCase {
+  const char* name;
+  ProgramFn program;
+  const char* nativeGates;
+  NativePredicate isNative;
+  bool checkEquivalence;
+};
+
+struct FusionCase {
+  const char* name;
+  ProgramFn program;
+  const char* nativeGates;
+  std::optional<std::size_t> exactCtrlCount;
+  std::optional<std::size_t> minCtrlCount;
+  bool checkTwoQUnitary;
+};
+
+class FuseTwoQubitUnitaryRunsPassTest : public testing::Test {
+protected:
+  void SetUp() override {
+    DialectRegistry registry;
+    registry.insert<mlir::qc::QCDialect, QCODialect, arith::ArithDialect,
+                    func::FuncDialect, memref::MemRefDialect>();
+    context = std::make_unique<MLIRContext>();
+    context->appendDialectRegistry(registry);
+    context->loadAllAvailableDialects();
+  }
+
+  static void runFusePipeline(OwningOpRef<ModuleOp>& moduleOp,
+                              StringRef nativeGates) {
+    PassManager pm(moduleOp->getContext());
+    pm.addPass(createQCToQCO());
+    pm.addPass(createFuseTwoQubitUnitaryRuns(FuseTwoQubitUnitaryRunsOptions{
+        .nativeGates = nativeGates.str(),
+    }));
+    ASSERT_TRUE(succeeded(pm.run(*moduleOp)));
+  }
+
+  static void runQcToQco(OwningOpRef<ModuleOp>& moduleOp) {
+    PassManager pm(moduleOp->getContext());
+    pm.addPass(createQCToQCO());
+    ASSERT_TRUE(succeeded(pm.run(*moduleOp)));
+  }
+
+  static void runTwoQFuse(OwningOpRef<ModuleOp>& moduleOp,
+                          StringRef nativeGates) {
+    PassManager pm(moduleOp->getContext());
+    pm.addPass(createFuseTwoQubitUnitaryRuns(FuseTwoQubitUnitaryRunsOptions{
+        .nativeGates = nativeGates.str(),
+    }));
+    ASSERT_TRUE(succeeded(pm.run(*moduleOp)));
+  }
+
+  static void expectQcoModulesEquivalent(const OwningOpRef<ModuleOp>& lhs,
+                                         const OwningOpRef<ModuleOp>& rhs) {
+    const auto lhsUnitary = computeUnitaryFromQcoModule(lhs);
+    ASSERT_TRUE(lhsUnitary.has_value());
+    const auto rhsUnitary = computeUnitaryFromQcoModule(rhs);
+    ASSERT_TRUE(rhsUnitary.has_value());
+    EXPECT_TRUE(isEquivalentUpToGlobalPhase(*lhsUnitary, *rhsUnitary));
+  }
+
+  void expectNativeAfterSynthesis(ProgramFn program, StringRef nativeGates,
+                                  NativePredicate isNative) {
+    auto moduleOp = mlir::qc::QCProgramBuilder::build(context.get(), program);
+    runFusePipeline(moduleOp, nativeGates);
+    EXPECT_TRUE(isNative(moduleOp));
+  }
+
+  void expectEquivalentAndNativeAfterSynthesis(ProgramFn program,
+                                               StringRef nativeGates,
+                                               NativePredicate isNative) {
+    auto expected = mlir::qc::QCProgramBuilder::build(context.get(), program);
+    runQcToQco(expected);
+    auto synthesized =
+        mlir::qc::QCProgramBuilder::build(context.get(), program);
+    runFusePipeline(synthesized, nativeGates);
+    EXPECT_TRUE(isNative(synthesized));
+    expectQcoModulesEquivalent(expected, synthesized);
+  }
+
+  void expectSynthesisFailure(ProgramFn program, StringRef nativeGates) {
+    auto moduleOp = mlir::qc::QCProgramBuilder::build(context.get(), program);
+    PassManager pm(moduleOp->getContext());
+    pm.addPass(createQCToQCO());
+    pm.addPass(createFuseTwoQubitUnitaryRuns(FuseTwoQubitUnitaryRunsOptions{
+        .nativeGates = nativeGates.str(),
+    }));
+    EXPECT_TRUE(failed(pm.run(*moduleOp)));
+  }
+
+  void expectTwoQFusePreservesUnitary(ProgramFn program,
+                                      StringRef nativeGates) {
+    auto expected = mlir::qc::QCProgramBuilder::build(context.get(), program);
+    ASSERT_TRUE(expected);
+    runQcToQco(expected);
+    auto fused = mlir::qc::QCProgramBuilder::build(context.get(), program);
+    ASSERT_TRUE(fused);
+    runQcToQco(fused);
+    runTwoQFuse(fused, nativeGates);
+    ASSERT_TRUE(succeeded(verify(*fused)));
+    expectQcoModulesEquivalent(expected, fused);
+  }
+
+  static std::size_t countCtrlOps(const OwningOpRef<ModuleOp>& moduleOp) {
+    std::size_t count = 0;
+    moduleOp.get()->walk([&](CtrlOp) { ++count; });
+    return count;
+  }
+
+  std::unique_ptr<MLIRContext> context;
+};
+
+class FuseTwoQubitProfileTest
+    : public FuseTwoQubitUnitaryRunsPassTest,
+      public testing::WithParamInterface<ProfileCase> {};
+
+class FuseTwoQubitFusionTest : public FuseTwoQubitUnitaryRunsPassTest,
+                               public testing::WithParamInterface<FusionCase> {
+};
+
+} // namespace
+
+static void fusionCxCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.cx(q0, q1);
+  b.cx(q0, q1);
+}
+
+static void fusionHCxInterleavedTCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.cx(q0, q1);
+  b.t(q1);
+  b.s(q0);
+  b.cx(q0, q1);
+}
+
+static void fusionThreeLineCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  const auto q2 = b.allocQubit();
+  b.cx(q0, q1);
+  b.cx(q1, q2);
+  b.cx(q0, q1);
+}
+
+static void fusionCxRSharedOtherPair(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  const auto q2 = b.allocQubit();
+  b.cx(q0, q1);
+  b.rz(0.17, q1);
+  b.cx(q1, q2);
+}
+
+static void fusionCxBarrierCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.cx(q0, q1);
+  b.barrier({q0, q1});
+  b.cx(q0, q1);
+}
+
+static void fusionSwapCxPattern(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.cx(q0, q1);
+  b.cx(q1, q0);
+  b.cx(q0, q1);
+}
+
+static void fusionHRzzSRzz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.rzz(-0.29, q0, q1);
+  b.s(q1);
+  b.rzz(0.17, q0, q1);
+}
+
+static void broadOneQThenCz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.x(q0);
+  b.y(q1);
+  b.h(q0);
+  b.sx(q1);
+  b.rx(0.13, q0);
+  b.ry(-0.47, q1);
+  b.rz(0.29, q0);
+  b.cz(q0, q1);
+}
+
+static void zeroAngleThenCz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.rx(0.0, q0);
+  b.ry(0.0, q1);
+  b.rz(0.0, q0);
+  b.p(0.0, q1);
+  b.cz(q0, q1);
+}
+
+static void ibmFractionalGateFamilies(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.rx(0.13, q1);
+  b.cx(q0, q1);
+  b.cz(q1, q0);
+  b.swap(q0, q1);
+  b.rzz(-0.33, q0, q1);
+  b.rzx(0.41, q0, q1);
+}
+
+static void hstycxTwoQ(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.s(q0);
+  b.t(q0);
+  b.y(q0);
+  b.cx(q0, q1);
+}
+
+static void cxYOnQ1(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.cx(q0, q1);
+  b.y(q1);
+}
+
+static void hCxTOnQ1(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q1);
+  b.cx(q0, q1);
+  b.t(q1);
+}
+
+static void xYSXCz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.x(q0);
+  b.y(q0);
+  b.sx(q0);
+  b.cz(q0, q1);
+}
+
+static void hYCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.y(q0);
+  b.cx(q0, q1);
+}
+
+static void zCx(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.z(q0);
+  b.cx(q0, q1);
+}
+
+static void xHCz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.x(q0);
+  b.h(q0);
+  b.cz(q0, q1);
+}
+
+static void hq0Yq1CxSq0(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.y(q1);
+  b.cx(q0, q1);
+  b.s(q0);
+}
+
+static void hCxSq1(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.h(q0);
+  b.cx(q0, q1);
+  b.s(q1);
+}
+
+static void threeQGhz(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  const auto q2 = b.allocQubit();
+  b.h(q0);
+  b.cx(q0, q1);
+  b.cx(q1, q2);
+}
+
+static void determinismSwap(mlir::qc::QCProgramBuilder& b) {
+  const auto q0 = b.allocQubit();
+  const auto q1 = b.allocQubit();
+  b.swap(q0, q1);
+  b.dealloc(q0);
+  b.dealloc(q1);
+}
+
+TEST_P(FuseTwoQubitProfileTest, SynthesizesToNativeMenu) {
+  const ProfileCase& c = GetParam();
+  if (c.checkEquivalence) {
+    expectEquivalentAndNativeAfterSynthesis(c.program, c.nativeGates,
+                                            c.isNative);
+  } else {
+    expectNativeAfterSynthesis(c.program, c.nativeGates, c.isNative);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    Menus, FuseTwoQubitProfileTest,
+    testing::Values(
+        ProfileCase{"SwapIbmBasic", mlir::qc::swap, "x,sx,rz,cx",
+                    onlyIbmBasicCxOps, false},
+        ProfileCase{"SwapGeneric", mlir::qc::swap, "u,cx", onlyGenericU3CxOps,
+                    false},
+        ProfileCase{"SwapIqm", mlir::qc::swap, "r,cz", onlyIqmDefaultOps,
+                    false},
+        ProfileCase{"HstycxIbm", hstycxTwoQ, "x,sx,rz,cx", onlyIbmBasicCxOps,
+                    false},
+        ProfileCase{"CxYIqm", cxYOnQ1, "r,cz", onlyIqmDefaultOps, false},
+        ProfileCase{"BroadOneQIqm", broadOneQThenCz, "r,cz", onlyIqmDefaultOps,
+                    false},
+        ProfileCase{"ZeroAngleRyRzCz", zeroAngleThenCz, "ry,rz,cz",
+                    onlyAxisPairRyRzCzOps, false},
+        ProfileCase{"HCxTIbmCz", hCxTOnQ1, "x,sx,rz,cz", onlyIbmBasicCzOps,
+                    false},
+        ProfileCase{"XYSXCzIqm", xYSXCz, "r,cz", onlyIqmDefaultOps, false},
+        ProfileCase{"IbmFractional", ibmFractionalGateFamilies,
+                    "x,sx,rz,rx,rzz,cz", onlyIbmFractionalOps, false},
+        ProfileCase{"HYCxRxRz", hYCx, "rx,rz,cx", onlyAxisPairRxRzCxOps, false},
+        ProfileCase{"ZCxRxRy", zCx, "rx,ry,cx", onlyAxisPairRxRyCxOps, false},
+        ProfileCase{"Hq0Yq1CxSq0", hq0Yq1CxSq0, "u,cx", onlyGenericU3CxOps,
+                    true},
+        ProfileCase{"XHCzRyRz", xHCz, "ry,rz,cz", onlyAxisPairRyRzCzOps, true},
+        ProfileCase{"HCxSq1MultiEnt", hCxSq1, "u,cx,cz", onlyGenericU3CxOrCzOps,
+                    true}),
+    [](const testing::TestParamInfo<ProfileCase>& info) {
+      return info.param.name;
+    });
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest, FailsForUnsupportedNativeGateMenu) {
+  expectSynthesisFailure(mlir::qc::h, "not-a-gate");
+}
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest,
+       FailsForNativeGateMenuWithoutSingleQEmitter) {
+  expectSynthesisFailure(mlir::qc::singleControlledX, "cx,cz");
+}
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest, FailsForMultiControlledGateStructure) {
+  expectSynthesisFailure(mlir::qc::multipleControlledX, "x,sx,rz,cx");
+}
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest,
+       CandidateSelectionIsDeterministicAcrossRuns) {
+  auto buildFn = [&] {
+    return mlir::qc::QCProgramBuilder::build(context.get(), determinismSwap);
+  };
+  auto firstModule = buildFn();
+  runFusePipeline(firstModule, "u,cx");
+  auto secondModule = buildFn();
+  runFusePipeline(secondModule, "u,cx");
+
+  std::string first;
+  std::string second;
+  llvm::raw_string_ostream osFirst(first);
+  llvm::raw_string_ostream osSecond(second);
+  firstModule->print(osFirst);
+  secondModule->print(osSecond);
+  EXPECT_EQ(first, second);
+}
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest, ThreeQubitGhzEquivalentOnCoreProfiles) {
+  const std::array<ProfileCase, 3> profiles{{
+      {.name = "GhzIbm",
+       .program = threeQGhz,
+       .nativeGates = "x,sx,rz,cx",
+       .isNative = onlyIbmBasicCxOps,
+       .checkEquivalence = false},
+      {.name = "GhzGeneric",
+       .program = threeQGhz,
+       .nativeGates = "u,cx",
+       .isNative = onlyGenericU3CxOps,
+       .checkEquivalence = false},
+      {.name = "GhzIqm",
+       .program = threeQGhz,
+       .nativeGates = "r,cz",
+       .isNative = onlyIqmDefaultOps,
+       .checkEquivalence = false},
+  }};
+  for (const ProfileCase& profile : profiles) {
+    auto expected = mlir::qc::QCProgramBuilder::build(context.get(), threeQGhz);
+    runQcToQco(expected);
+    auto synthesized =
+        mlir::qc::QCProgramBuilder::build(context.get(), threeQGhz);
+    runFusePipeline(synthesized, profile.nativeGates);
+    EXPECT_TRUE(profile.isNative(synthesized));
+    expectQcoModulesEquivalent(expected, synthesized);
+  }
+}
+
+TEST_P(FuseTwoQubitFusionTest, WindowFusionBehavior) {
+  const FusionCase& c = GetParam();
+  if (c.checkTwoQUnitary) {
+    expectTwoQFusePreservesUnitary(c.program, c.nativeGates);
+  }
+  auto module = mlir::qc::QCProgramBuilder::build(context.get(), c.program);
+  ASSERT_TRUE(module);
+  runQcToQco(module);
+  runTwoQFuse(module, c.nativeGates);
+  if (c.exactCtrlCount) {
+    EXPECT_EQ(countCtrlOps(module), *c.exactCtrlCount);
+  }
+  if (c.minCtrlCount) {
+    EXPECT_GE(countCtrlOps(module), *c.minCtrlCount);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    Windows, FuseTwoQubitFusionTest,
+    testing::Values(FusionCase{"AdjacentCxCancel", fusionCxCx, "u,cx", 0,
+                               std::nullopt, true},
+                    FusionCase{"InterleavedOneQ", fusionHCxInterleavedTCx,
+                               "u,cx", std::nullopt, std::nullopt, true},
+                    FusionCase{"DifferentPairBoundary", fusionThreeLineCx,
+                               "u,cx", std::nullopt, 1, false},
+                    FusionCase{"SharedWireOneQ", fusionCxRSharedOtherPair,
+                               "u,cx", std::nullopt, 2, false},
+                    FusionCase{"BarrierBoundary", fusionCxBarrierCx, "u,cx", 2,
+                               std::nullopt, false},
+                    FusionCase{"SwappedWireOrder", fusionSwapCxPattern, "u,cx",
+                               std::nullopt, std::nullopt, true},
+                    FusionCase{"RzzBlock", fusionHRzzSRzz, "x,sx,rz,rx,rzz,cz",
+                               std::nullopt, std::nullopt, true}),
+    [](const testing::TestParamInfo<FusionCase>& info) {
+      return info.param.name;
+    });
+
+TEST_F(FuseTwoQubitUnitaryRunsPassTest, InvalidNativeGatesFailsPass) {
+  auto module = mlir::qc::QCProgramBuilder::build(context.get(), fusionCxCx);
+  ASSERT_TRUE(module);
+  runQcToQco(module);
+  PassManager pm(module->getContext());
+  pm.addPass(createFuseTwoQubitUnitaryRuns(FuseTwoQubitUnitaryRunsOptions{
+      .nativeGates = "not-a-gate",
+  }));
+  EXPECT_TRUE(failed(pm.run(*module)));
+}
diff --git a/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp b/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
index afa0792415..421b123147 100644
--- a/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
+++ b/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
@@ -12,8 +12,11 @@
 
 #include <gtest/gtest.h>
 
+#include <array>
 #include <cmath>
 #include <complex>
+#include <cstddef>
+#include <random>
 #include <utility>
 
 using namespace mlir::qco;
@@ -303,6 +306,178 @@ TEST(Matrix4x4, AssignFromDynamicMatrix) {
   EXPECT_FALSE(out.assignFrom(DynamicMatrix::identity(2)));
 }
 
+TEST(UnitaryMatrix2x2, TransposeAndIsIdentity) {
+  const Matrix2x2 m = Matrix2x2::fromElements(1, 2i, 3, 4);
+  EXPECT_TRUE(m.transpose().isApprox(Matrix2x2::fromElements(1, 3, 2i, 4)));
+  EXPECT_TRUE(Matrix2x2::identity().isIdentity());
+  EXPECT_FALSE(pauliX().isIdentity());
+}
+
+TEST(UnitaryMatrix4x4, TransposeAndIsIdentity) {
+  Matrix4x4 m = Matrix4x4::identity();
+  m(0, 3) = 2i;
+  m(3, 0) = 5.0;
+  const Matrix4x4 t = m.transpose();
+  EXPECT_EQ(t(3, 0), 2i);
+  EXPECT_EQ(t(0, 3), 5.0);
+  EXPECT_TRUE(Matrix4x4::identity().isIdentity());
+  EXPECT_FALSE(swapMatrix().isIdentity());
+}
+
+TEST(UnitaryMatrix4x4, DiagonalColumnsAndParts) {
+  Matrix4x4 m =
+      Matrix4x4::fromElements(Complex{1, 1}, 0, 0, 0, 0, Complex{2, 2}, 0, 0, 0,
+                              0, Complex{3, 3}, 0, 0, 0, 0, Complex{4, 4});
+  const auto diag = m.diagonal();
+  EXPECT_EQ(diag[0], (Complex{1, 1}));
+  EXPECT_EQ(diag[3], (Complex{4, 4}));
+  EXPECT_TRUE(Matrix4x4::fromDiagonal(diag).isApprox(m));
+
+  const auto col1 = m.column(1);
+  EXPECT_EQ(col1[1], (Complex{2, 2}));
+  Matrix4x4 n = Matrix4x4::identity();
+  n.setColumn(2, {1i, 2i, 3i, 4i});
+  EXPECT_EQ(n(0, 2), 1i);
+  EXPECT_EQ(n(3, 2), 4i);
+
+  const auto re = m.realPart();
+  const auto im = m.imagPart();
+  EXPECT_EQ(re[0], 1.0);
+  EXPECT_EQ(im[0], 1.0);
+  EXPECT_EQ(re[15], 4.0);
+  EXPECT_EQ(im[15], 4.0);
+}
+
+TEST(UnitaryMatrix4x4, KroneckerProduct) {
+  const Matrix2x2 x = pauliX();
+  // X (x) I should swap the high bit.
+  const Matrix4x4 xi = kron(x, Matrix2x2::identity());
+  EXPECT_TRUE(xi.isApprox(Matrix4x4::fromElements(0, 0, 1, 0, // row 0
+                                                  0, 0, 0, 1, // row 1
+                                                  1, 0, 0, 0, // row 2
+                                                  0, 1, 0, 0)));
+  // I (x) X swaps the low bit.
+  const Matrix4x4 ix = kron(Matrix2x2::identity(), x);
+  EXPECT_TRUE(ix.isApprox(Matrix4x4::fromElements(0, 1, 0, 0, // row 0
+                                                  1, 0, 0, 0, // row 1
+                                                  0, 0, 0, 1, // row 2
+                                                  0, 0, 1, 0)));
+}
+
+TEST(UnitaryDynamicMatrix, NQubitEmbedMatchesTwoQubitSpecialization) {
+  const Matrix2x2 x = pauliX();
+  const Matrix4x4 cx = Matrix4x4::fromElements(1, 0, 0, 0, //
+                                               0, 1, 0, 0, //
+                                               0, 0, 0, 1, //
+                                               0, 0, 1, 0);
+  EXPECT_TRUE(embedSingleQubitInNqubit(x, 2, 0).isApprox(
+      embedSingleQubitInTwoQubit(x, 0)));
+  EXPECT_TRUE(embedTwoQubitInNqubit(cx, 2, 0, 1)
+                  .isApprox(reorderTwoQubitMatrix(cx, 0, 1)));
+  const DynamicMatrix cxOn01 = embedTwoQubitInNqubit(cx, 3, 0, 1);
+  const DynamicMatrix cxOn12 = embedTwoQubitInNqubit(cx, 3, 1, 2);
+  EXPECT_EQ(cxOn01.rows(), 8);
+  EXPECT_EQ(cxOn12.rows(), 8);
+  EXPECT_FALSE(cxOn01.isApprox(cxOn12));
+}
+
+TEST(UnitaryDynamicMatrix, MultiplyTraceAndScalar) {
+  const Matrix2x2 x = pauliX();
+  const DynamicMatrix embedded = embedSingleQubitInNqubit(x, 2, 0);
+  EXPECT_FALSE(embedded.isIdentity());
+  const Complex scalar = std::exp(1i * 0.3);
+  EXPECT_TRUE((scalar * embedded).isApprox(embedded * scalar));
+  const DynamicMatrix product = embedded * embedded;
+  EXPECT_TRUE(product.isIdentity());
+  EXPECT_NEAR(product.trace().real(), 4.0, 1e-12);
+}
+
+TEST(UnitaryMatrix2x2, ScalarLeftMultiply) {
+  const Matrix2x2 x = pauliX();
+  const Complex scalar = std::exp(1i * 0.5);
+  EXPECT_TRUE((scalar * x).isApprox(x * scalar));
+}
+
+TEST(UnitaryMatrix4x4, ScalarLeftMultiply) {
+  const Matrix4x4 swap = swapMatrix();
+  const Complex scalar = std::exp(1i * 0.25);
+  EXPECT_TRUE((scalar * swap).isApprox(swap * scalar));
+}
+
+TEST(GateMatrixFactories, RemEuclidAndControlledGates) {
+  EXPECT_DOUBLE_EQ(remEuclid(-1.0, 3.0), 2.0);
+  EXPECT_TRUE(twoQubitControlledX01().isApprox(
+      Matrix4x4::fromElements(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0)));
+  EXPECT_TRUE(twoQubitControlledX10().isApprox(
+      reorderTwoQubitMatrix(twoQubitControlledX01(), 1, 0)));
+  EXPECT_TRUE(rxMatrix(0.0).isIdentity());
+  EXPECT_TRUE((iPauliX() * iPauliX()).isApprox(-1.0 * Matrix2x2::identity()));
+}
+
+TEST(JacobiEigensolver, DiagonalMatrix) {
+  std::array<double, 16> a{};
+  a[0] = 3.0;
+  a[5] = 1.0;
+  a[10] = 4.0;
+  a[15] = 2.0;
+  const SymmetricEigen4 result = jacobiSymmetricEigen(a);
+  EXPECT_NEAR(result.eigenvalues[0], 1.0, 1e-12);
+  EXPECT_NEAR(result.eigenvalues[1], 2.0, 1e-12);
+  EXPECT_NEAR(result.eigenvalues[2], 3.0, 1e-12);
+  EXPECT_NEAR(result.eigenvalues[3], 4.0, 1e-12);
+}
+
+TEST(JacobiEigensolver, ReconstructsRandomSymmetric) {
+  std::mt19937 rng(0xC0FFEE);
+  std::uniform_real_distribution<double> dist(-2.0, 2.0);
+  for (int trial = 0; trial < 50; ++trial) {
+    std::array<double, 16> a{};
+    for (std::size_t i = 0; i < 4; ++i) {
+      for (std::size_t j = i; j < 4; ++j) {
+        const double value = dist(rng);
+        a[(i * 4) + j] = value;
+        a[(j * 4) + i] = value;
+      }
+    }
+    const SymmetricEigen4 result = jacobiSymmetricEigen(a);
+
+    // Eigenvalues are ascending.
+    for (std::size_t i = 0; i + 1 < 4; ++i) {
+      EXPECT_LE(result.eigenvalues[i], result.eigenvalues[i + 1] + 1e-12);
+    }
+
+    // Eigenvectors are orthonormal: V^T V == I.
+    const Matrix4x4& v = result.eigenvectors;
+    EXPECT_TRUE((v.transpose() * v).isIdentity(1e-9));
+
+    // Reconstruction: V D V^T == A.
+    const Matrix4x4 d =
+        Matrix4x4::fromDiagonal({result.eigenvalues[0], result.eigenvalues[1],
+                                 result.eigenvalues[2], result.eigenvalues[3]});
+    const Matrix4x4 reconstructed = v * d * v.transpose();
+    Matrix4x4 original{};
+    for (std::size_t k = 0; k < 16; ++k) {
+      original(k / 4, k % 4) = a[k];
+    }
+    EXPECT_TRUE(reconstructed.isApprox(original, 1e-9));
+  }
+}
+
+TEST(JacobiEigensolver, HandlesDegenerateSpectrum) {
+  // A scalar multiple of the identity: every vector is an eigenvector, but the
+  // returned basis must still be orthonormal.
+  std::array<double, 16> a{};
+  for (std::size_t i = 0; i < 4; ++i) {
+    a[(i * 4) + i] = 2.5;
+  }
+  const SymmetricEigen4 result = jacobiSymmetricEigen(a);
+  for (const double value : result.eigenvalues) {
+    EXPECT_NEAR(value, 2.5, 1e-12);
+  }
+  const Matrix4x4& v = result.eigenvectors;
+  EXPECT_TRUE((v.transpose() * v).isIdentity(1e-9));
+}
+
 TEST(DynamicMatrix, IsApproxOverloads) {
   const Matrix1x1 phase = Matrix1x1::fromElements(Complex{0.25, 0.5});
   const Matrix2x2 x = pauliX();
diff --git a/mlir/unittests/TestCaseUtils.h b/mlir/unittests/TestCaseUtils.h
index 570c86c87f..7603fda8be 100644
--- a/mlir/unittests/TestCaseUtils.h
+++ b/mlir/unittests/TestCaseUtils.h
@@ -18,6 +18,8 @@
 #include <llvm/Support/raw_ostream.h>
 #include <mlir/IR/BuiltinOps.h>
 
+#include <cmath>
+#include <complex> // NOLINT(misc-include-cleaner)
 #include <cstddef>
 #include <cstdlib>
 #include <string>
@@ -26,6 +28,26 @@
 
 namespace mqt::test {
 
+/**
+ * Check whether two unitary matrices are equal up to a single unit-modulus
+ * global phase factor.
+ *
+ * The comparison is symmetric and numerically stable in the sense that a near
+ * zero overlap (``|trace(rhs^H * lhs)| <= atol``) is treated as "not
+ * equivalent" to avoid division by a tiny number.
+ */
+template <typename Matrix>
+[[nodiscard]] bool isEquivalentUpToGlobalPhase(const Matrix& lhs,
+                                               const Matrix& rhs,
+                                               double atol = 1e-10) {
+  const auto overlap = (rhs.adjoint() * lhs).trace();
+  if (std::abs(overlap) <= atol) {
+    return false;
+  }
+  const auto factor = overlap / std::abs(overlap);
+  return lhs.isApprox(factor * rhs, atol);
+}
+
 template <typename BuilderT> struct NamedBuilder {
   const char* name = nullptr;
   void (*fn)(BuilderT&) = nullptr;