3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "3rdparty/composable_kernel"]
path = 3rdparty/composable_kernel
url = https://github.com/ROCm/composable_kernel
[submodule "3rdparty/DLCompiler"]
path = 3rdparty/DLCompiler
url = https://github.com/DeepLink-org/DLCompiler.git
1 change: 1 addition & 0 deletions 3rdparty/DLCompiler
Submodule DLCompiler added at 51ef7c
10 changes: 10 additions & 0 deletions CMakeLists.txt
@@ -170,6 +170,8 @@ if(NOT TILELANG_BACKEND_USER_SELECTED)
set(USE_METAL ON)
elseif($ENV{USE_ROCM})
set(USE_ROCM ON)
elseif($ENV{USE_COMMONIR})
set(USE_COMMONIR ON)
else()
if($ENV{USE_CUDA})
set(USE_CUDA ON)
@@ -183,6 +185,14 @@
endif()
endif()

if(USE_COMMONIR)
file(GLOB TILE_LANG_COMMONIR_SRCS
src/target/codegen_commonir.cc
src/target/rt_mod_commonir.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_COMMONIR_SRCS})
endif()

if(USE_METAL)
file(GLOB TILE_LANG_METAL_SRCS
src/target/rt_mod_metal.cc
1 change: 1 addition & 0 deletions README.md
@@ -13,6 +13,7 @@ Tile Language (**tile-lang**) is a concise domain-specific language designed to
<img src=./images/MatmulExample.png />

## Latest News
- 01/29/2025 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.

⚠️ Potential issue | 🟡 Minor

Date year appears incorrect.

The entry is dated 01/29/2025, but given the current date (January 2026) and the reverse-chronological ordering of the news section, this should likely be 01/29/2026. With the current date, this entry would be out of order—it should appear much lower in the list, after entries from later in 2025.

📝 Suggested fix
-- 01/29/2025 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.
+- 01/29/2026 🚀: Added CommonIR support, enabling compilation on wider domestic GPU through [DLCompiler](https://github.com/DeepLink-org/DLCompiler) integration.
🤖 Prompt for AI Agents
In `@README.md` at line 16: update the dated news entry that currently reads
"01/29/2025 🚀: Added CommonIR support..." to the correct year "01/29/2026" and
ensure the News/Changelog block remains in reverse-chronological order;
specifically locate the "01/29/2025" string in README.md and change it to
"01/29/2026" (and if needed reorder adjacent entries so later 2025/2026 items
remain in proper descending date order).

- 12/18/2025 🚀: Added [CuTeDSL backend](https://github.com/tile-ai/tilelang/pull/1421) support, enabling compilation to NVIDIA CUTLASS CuTe DSL! Join us in building and optimizing this exciting new backend: [Issue #1454](https://github.com/tile-ai/tilelang/issues/1454).
- 12/17/2025 🔬: Integrated [Z3 theorem prover](https://github.com/tile-ai/tilelang/pull/1367) into TVM Arith Analyzer, bringing SMT-based symbolic reasoning for enhanced optimizations and automatic correctness verification!
- 10/31/2025 🔧: Migrated to [apache-tvm-ffi](https://github.com/tile-ai/tilelang/pull/1108), significantly reducing CPU overhead!
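The examples added under examples/commonir/ below exercise the new backend end to end. As a minimal sketch of the flow they share, assuming tile-lang is built with the USE_COMMONIR environment variable set (per the CMakeLists.txt change above), that the 'commonir' target string is available as used in examples/commonir/gemm.py, and that torch_npu exposes an NPU device; the scale_by_two kernel here is illustrative only and not part of this PR:

# Minimal end-to-end sketch (assumption: tile-lang built with USE_COMMONIR=1;
# the scale_by_two kernel is illustrative, not part of this PR).
import torch
import torch_npu  # noqa: F401  # registers the "npu" device with PyTorch

import tilelang
import tilelang.language as T


def scale_by_two(N, block_N, dtype="float32"):

    @T.prim_func
    def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype)):
        # One program instance per block of block_N elements, as in add_vector.py below.
        with T.Kernel(N // block_N, 1) as (by, bx):
            for i in T.Parallel(block_N):
                B[by * block_N + i] = A[by * block_N + i] * 2.0

    return main


# target='commonir' routes code generation through DLCompiler, as in examples/commonir/gemm.py.
kernel = tilelang.compile(scale_by_two(1024, 256), target="commonir")
x = torch.randn(1024).npu()
y = torch.empty(1024).npu()
kernel(x, y)
torch.testing.assert_close(y, x * 2.0, rtol=1e-2, atol=1e-2)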
54 changes: 54 additions & 0 deletions examples/commonir/add_vector.py
@@ -0,0 +1,54 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import tilelang
import tilelang.language as T

import torch
import torch_npu  # noqa: F401  # needed so Tensor.npu() can place data on the NPU

dtype = "float32"
seq_len = 1024


def vec_add(N, block_N, dtype="float32"):
    n_num = N // block_N

    @T.prim_func
    def main(
            A: T.Tensor((N), dtype),
            B: T.Tensor((N), dtype),
            C: T.Tensor((N), dtype),
    ):
        # Each program instance handles one contiguous block of block_N elements.
        with T.Kernel(n_num, 1) as (by, bx):
            start_y = by * block_N + bx
            for local_y in T.Parallel(block_N):
                y = start_y + local_y
                C[y] = A[y] + B[y]

    return main


def test_vec_add():
    func = vec_add(seq_len, seq_len // 4)
    compiled_kernel = tilelang.compile(func)

    torch_dtype = getattr(torch, dtype)
    v1 = torch.randn(size=[seq_len], dtype=torch_dtype).npu()
    v2 = torch.randn(size=[seq_len], dtype=torch_dtype).npu()
    v3 = torch.zeros(size=[seq_len], dtype=torch_dtype).npu()

    y_ref = v1 + v2
    compiled_kernel(v1, v2, v3)

    print(f'The maximum difference between torch and TileLang is '
          f'{torch.max(torch.abs(y_ref - v3))}')

    torch.testing.assert_close(v3, y_ref, atol=1e-2, rtol=0)


if __name__ == "__main__":
    test_vec_add()
52 changes: 52 additions & 0 deletions examples/commonir/add_vector_profiler.py
@@ -0,0 +1,52 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import tilelang
import tilelang.language as T

import torch
import torch_npu  # noqa: F401  # registers the NPU device used by the profiler's tensors

dtype = "float32"
seq_len = 1024


def vec_add(N, block_N, dtype="float32"):
    n_num = N // block_N

    @T.prim_func
    def main(
            A: T.Tensor((N), dtype),
            B: T.Tensor((N), dtype),
            C: T.Tensor((N), dtype),
    ):
        with T.Kernel(n_num, 1) as (by, bx):
            start_y = by * block_N + bx
            for local_y in T.Parallel(block_N):
                y = start_y + local_y
                C[y] = A[y] + B[y]

    return main


def ref_program(v1, v2):
    return v1 + v2


def test_vec_add():
    func = vec_add(seq_len, seq_len // 4)
    # out_idx=[2] marks C as the kernel's output tensor.
    compiled_kernel = tilelang.compile(func, out_idx=[2])

    profiler = compiled_kernel.get_profiler()
    profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)
    latency = profiler.do_bench(ref_program, warmup=500)
    latency2 = profiler.do_bench(warmup=500)
    print(f"⏱ torch reference latency is {latency}")
    print(f"⏱ compiled kernel latency is {latency2}")


if __name__ == "__main__":
    test_vec_add()
52 changes: 52 additions & 0 deletions examples/commonir/gemm.py
@@ -0,0 +1,52 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.

import tilelang
import tilelang.language as T

import torch
import torch_npu
device = torch.npu.current_device()

⚠️ Potential issue | 🟡 Minor

Module-level NPU device access may cause import failure.

torch.npu.current_device() is called at module load time, which will raise an error if NPU is not available or torch_npu is not properly initialized. Consider moving this inside main() to allow the module to be imported without an active NPU context.

🛡️ Proposed fix
 import torch
 import torch_npu
-device = torch.npu.current_device()
 dtype = torch.float16

 def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
     ...

 def main():
+    device = torch.npu.current_device()
     func = matmul(1024, 1024, 1024, 128, 128, 32)
🤖 Prompt for AI Agents
In `@examples/commonir/gemm.py` at line 9: the module-level call to
torch.npu.current_device() (stored in the device variable) can fail at import if
NPU isn't available; move the device detection inside main() (or another runtime
entrypoint) and replace the module-level device assignment with deferred lookup
so imports succeed without an active NPU context; update any references that
used the module-level device variable to retrieve the device inside main() or
pass it down to functions that need it.

dtype = torch.float16


def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):

    @T.prim_func
    def gemm(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            T.clear(C_local)
            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[k * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)

            T.copy(C_local, C[by * block_M, bx * block_N])

    return gemm


def main():
    func = matmul(1024, 1024, 1024, 128, 128, 32)
    kernel = tilelang.compile(func, target='commonir')
    SIZEALL = 1024

    torch.manual_seed(0)
    a = torch.rand((SIZEALL, SIZEALL), dtype=dtype, device=device) - 0.5
    b = torch.rand((SIZEALL, SIZEALL), dtype=dtype, device=device) - 0.5
    result = torch.zeros((SIZEALL, SIZEALL), dtype=dtype, device=device)

    kernel(a, b, result)
    golden = a @ b
    # print(f"result is {result}, golden is {golden}")
    torch.testing.assert_close(result, golden, atol=1e-2, rtol=1e-2)


if __name__ == "__main__":
    main()