MetaX-MACA · Five-HZ · Jul 25, 2025 · Jul 24, 2025 · Jul 24, 2025 · Jul 24, 2025
diff --git a/NOTICE b/NOTICE
@@ -53,7 +53,10 @@ The following files may have been Modified by MetaX Integrated Circuits (Shangha
     modified: tests/python/all-platform-minimal-test/test_runtime_ndarray.py
     modified: tests/python/codegen/test_gpu_codegen_allreduce.py
     modified: tests/python/codegen/test_target_codegen.py
+    modified: tests/python/codegen/test_target_codegen_blob.py
     modified: tests/python/codegen/test_target_codegen_cuda.py
+    modified: tests/python/codegen/test_target_codegen_device.py
+    modified: tests/python/codegen/test_target_codegen_extern.py
     modified: tests/python/codegen/test_target_codegen_gpu_common.py
     modified: tests/python/testing/test_tvm_testing_features.py
 Modification copyright 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd.
@@ -78,6 +81,7 @@ The following files are newly added by MetaX Integrated Circuits (Shanghai) Co.,
     added: src/target/source/codegen_maca.h
     added: src/target/source/intrin_rule_maca.cc
     added: src/target/source/literal/maca_half_t.h
+    added: tests/python/codegen/test_target_codegen_maca.py
     added: tests/scripts/notice.py
 
 ---------------------------------------------------

diff --git a/src/target/source/literal/maca_half_t.h b/src/target/source/literal/maca_half_t.h
@@ -488,10 +488,7 @@ __host__ __device__ half4 make_half4(__half x, __half y, __half z, __half w) {
     stream << R"(
 using maca_bfloat164 = half4_bfloat164<maca_bfloat16, maca_bfloat162>;
 __host__ __device__ maca_bfloat164 make_maca_bfloat164(maca_bfloat16 x, maca_bfloat16 y, maca_bfloat16 z, maca_bfloat16 w) {
-    return maca_bfloat164(x, y, z, w);
-}
-__host__ __device__ maca_bfloat162 make_maca_bfloat162(maca_bfloat16 x, maca_bfloat16 y) {
-    return maca_bfloat162(x, y);
+    return maca_bfloat164{x, y, z, w};
 }
 )";
     if (enable_fp8) {

diff --git a/tests/python/codegen/test_target_codegen.py b/tests/python/codegen/test_target_codegen.py
@@ -70,7 +70,7 @@ def func(a: T.handle, b: T.handle):
             tvm.compile(func)
 
 
-@tvm.testing.parametrize_targets("cuda", "opencl", "metal", "rocm", "vulkan -from_device=0")
+@tvm.testing.parametrize_targets("cuda", "opencl", "metal", "rocm", "vulkan -from_device=0", "maca")
 def test_buffer_load_predicate_not_supported_gpu(target):
     @T.prim_func
     def func(a: T.handle, b: T.handle):

diff --git a/tests/python/codegen/test_target_codegen_blob.py b/tests/python/codegen/test_target_codegen_blob.py
@@ -94,6 +94,78 @@ def popen_check():
     worker.recv()
 
 
+@tvm.testing.uses_gpu
+def test_maca_multi_lib():
+    # Test combining two system libs into a single shared library;
+    # each lib contains its own fatbin component in maca.
+    # The device handle is created inside popen_check, the only place it is used.
+    for device in ["llvm", "maca"]:
+        if not tvm.testing.device_enabled(device):
+            print(f"skip because {device} is not enabled...")
+            return
+
+    @tvm.script.ir_module
+    class ModA:
+        I.module_attrs({"system_lib_prefix": "modA_"})
+
+        @T.prim_func
+        def my_inplace_update(x: T.Buffer((12), "float32")) -> None:
+            T.func_attr({"global_symbol": "modA_my_inplace_update"})
+            for bx in T.thread_binding(T.int64(1), thread="blockIdx.x"):
+                for tx in T.thread_binding(T.int64(12), thread="threadIdx.x"):
+                    x[tx] = x[tx] + 1
+
+    @tvm.script.ir_module
+    class ModB:
+        I.module_attrs({"system_lib_prefix": "modB_"})
+
+        @T.prim_func
+        def my_inplace_update(x: T.Buffer((12), "float32")) -> None:
+            T.func_attr({"global_symbol": "modB_my_inplace_update"})
+            for bx in T.thread_binding(T.int64(1), thread="blockIdx.x"):
+                for tx in T.thread_binding(T.int64(12), thread="threadIdx.x"):
+                    x[tx] = x[tx] + 2
+
+    temp = utils.tempdir()
+    target = tvm.target.Target("maca", host="llvm")
+    libA = tvm.compile(ModA, target=target)
+    libB = tvm.compile(ModB, target=target)
+
+    pathA = temp.relpath("libA.tar")
+    pathB = temp.relpath("libB.tar")
+    pathAll = temp.relpath("libAll.a")
+
+    path_dso = temp.relpath("mylib.so")
+    libA.export_library(pathA, fcompile=tar.tar)
+    libB.export_library(pathB, fcompile=tar.tar)
+    cc.create_staticlib(pathAll, [pathA, pathB])
+    # package the two static libs together into one shared library
+    cc.create_shared(path_dso, ["-Wl,--whole-archive", pathAll, "-Wl,--no-whole-archive"])
+
+    def popen_check():
+        # Load dll, will trigger system library registration
+        ctypes.CDLL(path_dso)
+        # Load the system wide library
+        dev = tvm.maca()
+        a_np = np.random.uniform(size=12).astype("float32")
+        a_nd = tvm.nd.array(a_np, dev)
+        b_nd = tvm.nd.array(a_np, dev)
+        syslibA = tvm.runtime.system_lib("modA_")
+        syslibB = tvm.runtime.system_lib("modB_")
+        # reload same lib twice (deliberate: looking up modA_ again must still work)
+        syslibA = tvm.runtime.system_lib("modA_")
+        syslibA["my_inplace_update"](a_nd)
+        syslibB["my_inplace_update"](b_nd)
+        np.testing.assert_equal(a_nd.numpy(), a_np + 1)
+        np.testing.assert_equal(b_nd.numpy(), a_np + 2)
+
+    # system lib should be loaded in different process
+    worker = popen_pool.PopenWorker()
+    worker.send(popen_check)
+    worker.recv()
+
+
 if __name__ == "__main__":
     test_synthetic()
     test_cuda_multilib()
+    test_maca_multi_lib()
diff --git a/tests/python/codegen/test_target_codegen_device.py b/tests/python/codegen/test_target_codegen_device.py
@@ -56,6 +56,7 @@ def check_target(device):
 
     check_target("cuda")
     check_target("vulkan -from_device=0")
+    check_target("maca")
 
 
 @tvm.testing.requires_gpu
@@ -108,6 +109,7 @@ def check_target(device, host):
     check_target("nvptx", host="llvm")
     check_target("vulkan", host="llvm")
     check_target("rocm", host="llvm")
+    check_target("maca", host="llvm")
 
 
 if __name__ == "__main__":

diff --git a/tests/python/codegen/test_target_codegen_extern.py b/tests/python/codegen/test_target_codegen_extern.py
@@ -66,8 +66,8 @@ def extern_generator_gpu(ins, outs):
     def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
-        mod = mod_gpu if target in ["opencl", "cuda"] else mod_cpu
-        C = C_gpu if target in ["opencl", "cuda"] else C_cpu
+        mod = mod_gpu if target in ["opencl", "cuda", "maca"] else mod_cpu
+        C = C_gpu if target in ["opencl", "cuda", "maca"] else C_cpu
         # build and invoke the kernel.
         f = tvm.compile(mod, target=target)
         dev = tvm.device(target, 0)
@@ -81,6 +81,7 @@ def check_target(target):
     check_target("llvm")
     check_target("opencl")
     check_target("cuda")
+    check_target("maca")
 
 
 def test_pack_buffer_simple():