3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "3rdparty/composable_kernel"]
path = 3rdparty/composable_kernel
url = https://github.com/ROCm/composable_kernel
[submodule "3rdparty/DLCompiler"]
path = 3rdparty/DLCompiler
url = https://github.com/DeepLink-org/DLCompiler.git
1 change: 1 addition & 0 deletions 3rdparty/DLCompiler
Submodule DLCompiler added at 51ef7c
10 changes: 10 additions & 0 deletions CMakeLists.txt
@@ -170,6 +170,8 @@ if(NOT TILELANG_BACKEND_USER_SELECTED)
set(USE_METAL ON)
elseif($ENV{USE_ROCM})
set(USE_ROCM ON)
elseif($ENV{USE_COMMONIR})
set(USE_COMMONIR ON)
else()
if($ENV{USE_CUDA})
set(USE_CUDA ON)
@@ -183,6 +185,14 @@
endif()
endif()

if(USE_COMMONIR)
file(GLOB TILE_LANG_COMMONIR_SRCS
src/target/codegen_commonir.cc
src/target/rt_mod_commonir.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_COMMONIR_SRCS})
endif()

if(USE_METAL)
file(GLOB TILE_LANG_METAL_SRCS
src/target/rt_mod_metal.cc
1 change: 1 addition & 0 deletions README.md
@@ -13,6 +13,7 @@ Tile Language (**tile-lang**) is a concise domain-specific language designed to
<img src=./images/MatmulExample.png />

## Latest News
- 01/29/2025 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.

⚠️ Potential issue | 🟡 Minor

Date year appears incorrect.

The entry is dated 01/29/2025, but given the current date (January 2026) and the reverse-chronological ordering of the news section, this should likely be 01/29/2026. With the current date, this entry would be out of order—it should appear much lower in the list, after entries from later in 2025.

📝 Suggested fix
-- 01/29/2025 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.
+- 01/29/2026 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.
🤖 Prompt for AI Agents
In `@README.md` at line 16: update the dated news entry that currently reads
"01/29/2025 🚀: Added CommonIR support..." to the correct year "01/29/2026" and
ensure the News/Changelog block remains in reverse-chronological order;
specifically locate the "01/29/2025" string in README.md and change it to
"01/29/2026" (and if needed reorder adjacent entries so later 2025/2026 items
remain in proper descending date order).

- 12/18/2025 🚀: Added [CuTeDSL backend](https://github.com/tile-ai/tilelang/pull/1421) support, enabling compilation to NVIDIA CUTLASS CuTe DSL! Join us in building and optimizing this exciting new backend: [Issue #1454](https://github.com/tile-ai/tilelang/issues/1454).
- 12/17/2025 🔬: Integrated [Z3 theorem prover](https://github.com/tile-ai/tilelang/pull/1367) into TVM Arith Analyzer, bringing SMT-based symbolic reasoning for enhanced optimizations and automatic correctness verification!
- 10/31/2025 🔧: Migrated to [apache-tvm-ffi](https://github.com/tile-ai/tilelang/pull/1108), significantly reducing CPU overhead!
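The examples added under examples/commonir/ below exercise the new backend end to end. As a minimal sketch of the flow they share, assuming tile-lang is built with the USE_COMMONIR environment variable set (per the CMakeLists.txt change above), that the 'commonir' target string is available as used in examples/commonir/gemm.py, and that torch_npu exposes an NPU device; the scale_by_two kernel here is illustrative only and not part of this PR:

# Minimal end-to-end sketch (assumption: tile-lang built with USE_COMMONIR=1;
# the scale_by_two kernel is illustrative, not part of this PR).
import torch
import torch_npu  # noqa: F401  # registers the "npu" device with PyTorch

import tilelang
import tilelang.language as T


def scale_by_two(N, block_N, dtype="float32"):

    @T.prim_func
    def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype)):
        # One program instance per block of block_N elements, as in add_vector.py below.
        with T.Kernel(N // block_N, 1) as (by, bx):
            for i in T.Parallel(block_N):
                B[by * block_N + i] = A[by * block_N + i] * 2.0

    return main


# target='commonir' routes code generation through DLCompiler, as in examples/commonir/gemm.py.
kernel = tilelang.compile(scale_by_two(1024, 256), target="commonir")
x = torch.randn(1024).npu()
y = torch.empty(1024).npu()
kernel(x, y)
torch.testing.assert_close(y, x * 2.0, rtol=1e-2, atol=1e-2)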
54 changes: 54 additions & 0 deletions examples/commonir/add_vector.py
@@ -0,0 +1,54 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import tilelang
import tilelang.language as T

import torch
import torch_npu  # noqa: F401  # needed so Tensor.npu() can place data on the NPU

dtype = "float32"
seq_len = 1024


def vec_add(N, block_N, dtype="float32"):
    n_num = N // block_N

    @T.prim_func
    def main(
            A: T.Tensor((N), dtype),
            B: T.Tensor((N), dtype),
            C: T.Tensor((N), dtype),
    ):
        # Each program instance handles one contiguous block of block_N elements.
        with T.Kernel(n_num, 1) as (by, bx):
            start_y = by * block_N + bx
            for local_y in T.Parallel(block_N):
                y = start_y + local_y
                C[y] = A[y] + B[y]

    return main


def test_vec_add():
    func = vec_add(seq_len, seq_len // 4)
    compiled_kernel = tilelang.compile(func)

    torch_dtype = getattr(torch, dtype)
    v1 = torch.randn(size=[seq_len], dtype=torch_dtype).npu()
    v2 = torch.randn(size=[seq_len], dtype=torch_dtype).npu()
    v3 = torch.zeros(size=[seq_len], dtype=torch_dtype).npu()

    y_ref = v1 + v2
    compiled_kernel(v1, v2, v3)

    print(f'The maximum difference between torch and TileLang is '
          f'{torch.max(torch.abs(y_ref - v3))}')

    torch.testing.assert_close(v3, y_ref, atol=1e-2, rtol=0)


if __name__ == "__main__":
    test_vec_add()
52 changes: 52 additions & 0 deletions examples/commonir/add_vector_profiler.py
@@ -0,0 +1,52 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import tilelang
import tilelang.language as T

import torch
import torch_npu  # noqa: F401  # registers the NPU device used by the profiler's tensors

dtype = "float32"
seq_len = 1024


def vec_add(N, block_N, dtype="float32"):
    n_num = N // block_N

    @T.prim_func
    def main(
            A: T.Tensor((N), dtype),
            B: T.Tensor((N), dtype),
            C: T.Tensor((N), dtype),
    ):
        with T.Kernel(n_num, 1) as (by, bx):
            start_y = by * block_N + bx
            for local_y in T.Parallel(block_N):
                y = start_y + local_y
                C[y] = A[y] + B[y]

    return main


def ref_program(v1, v2):
    return v1 + v2


def test_vec_add():
    func = vec_add(seq_len, seq_len // 4)
    # out_idx=[2] marks C as the kernel's output tensor.
    compiled_kernel = tilelang.compile(func, out_idx=[2])

    profiler = compiled_kernel.get_profiler()
    profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)
    latency = profiler.do_bench(ref_program, warmup=500)
    latency2 = profiler.do_bench(warmup=500)
    print(f"⏱ torch reference latency is {latency}")
    print(f"⏱ compiled kernel latency is {latency2}")


if __name__ == "__main__":
    test_vec_add()
52 changes: 52 additions & 0 deletions examples/commonir/gemm.py
@@ -0,0 +1,52 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.

import tilelang
import tilelang.language as T

import torch
import torch_npu
device = torch.npu.current_device()

⚠️ Potential issue | 🟡 Minor

Module-level NPU device access may cause import failure.

torch.npu.current_device() is called at module load time, which will raise an error if NPU is not available or torch_npu is not properly initialized. Consider moving this inside main() to allow the module to be imported without an active NPU context.

🛡️ Proposed fix
 import torch
 import torch_npu
-device = torch.npu.current_device()
 dtype = torch.float16

 def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
     ...

 def main():
+    device = torch.npu.current_device()
     func = matmul(1024, 1024, 1024, 128, 128, 32)
🤖 Prompt for AI Agents
In `@examples/commonir/gemm.py` at line 9: the module-level call to
torch.npu.current_device() (stored in the device variable) can fail at import if
NPU isn't available; move the device detection inside main() (or another runtime
entrypoint) and replace the module-level device assignment with deferred lookup
so imports succeed without an active NPU context; update any references that
used the module-level device variable to retrieve the device inside main() or
pass it down to functions that need it.

dtype = torch.float16


def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):

    @T.prim_func
    def gemm(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            T.clear(C_local)
            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[k * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)

            T.copy(C_local, C[by * block_M, bx * block_N])

    return gemm


def main():
    func = matmul(1024, 1024, 1024, 128, 128, 32)
    kernel = tilelang.compile(func, target='commonir')
    SIZEALL = 1024

    torch.manual_seed(0)
    a = torch.rand((SIZEALL, SIZEALL), dtype=dtype, device=device) - 0.5
    b = torch.rand((SIZEALL, SIZEALL), dtype=dtype, device=device) - 0.5
    result = torch.zeros((SIZEALL, SIZEALL), dtype=dtype, device=device)

    kernel(a, b, result)
    golden = a @ b
    # print(f"result is {result}, golden is {golden}")
    torch.testing.assert_close(result, golden, atol=1e-2, rtol=1e-2)


if __name__ == "__main__":
    main()