import freetensor as ft
import torch
device = ft.GPU(0)
target = device.target()
host = ft.CPU()
h = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
t = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
r = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
eemb = torch.rand(93773, 512).cuda(0)
remb = torch.rand(51, 512).cuda(0)
proj = torch.rand(51, 512, 512).cuda(0)
res = torch.rand(4096, 512).cuda(0)
batch_size = h.shape[0]
dim = eemb.shape[1]
enode = eemb.shape[0]
rnode = remb.shape[0]
def transr():
def eval(func, print_code=False, time=False):
func = ft.lower(func, target)
if print_code:
print(func, flush=True)
code = ft.codegen(func, target)
if print_code:
print(code, flush=True)
driver = ft.build_binary(code, device)
res = torch.zeros(batch_size,).cuda(0)
head = ft.Array(h)
tail = ft.Array(t)
relation = ft.Array(r)
entemb = ft.Array(eemb)
relemb = ft.Array(remb)
pemb = ft.Array(proj)
res = ft.Array(res)
driver.set_args(heads=head, tails=tail, relations=relation, entemb=entemb, relemb=relemb, pemb=pemb, result=res)
if time:
t1, _ = driver.time()
print("time: %s ms" % t1)
else:
driver.run()
B_np = res.torch()
return B_np
@ft.transform
def score_func(heads, tails, relations, entemb, relemb, pemb, result):
heads: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
tails: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
relations: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
entemb: ft.Var[(enode, dim, ), "float32", "input", "gpu/global"]
relemb: ft.Var[(enode, dim, ), "float32", "input", "gpu/global"]
pemb: ft.Var[(enode, dim, dim, ), "float32", "input", "gpu/global"]
result: ft.Var[(batch_size, ), "float32", "output", "gpu/global"]
inter: ft.Var[(batch_size, dim,), "float32", "cache", "gpu/global"]
# inter = ft.empty((batch_size, dim), "float32")
#! label: bx
for bb in range(batch_size):
#! label: ty
for dd in range(dim):
#! label: tx
for kk in range(dim):
inter[bb, dd] += (entemb[heads[bb], kk] - entemb[tails[bb], kk]) * pemb[relations[bb], kk, dd]
result[bb] += ft.abs(inter[bb, dd] + relemb[relations[bb], dd])
s = ft.Schedule(score_func)
s.parallelize("bx", "blockIdx.x")
s.parallelize("ty", "threadIdx.y")
s.parallelize("tx", "threadIdx.x")
func = s.func()
result = eval(func, True, True)
transr()
I followed your test code on: https://github.com/roastduck/FreeTensor/blob/master/test/70.program/test_gpu_conv2d.py
I'm not sure if 'cache' is the correct keyword for this case, but a CUDA error was found:
ptxas warning : Value of threads per SM for entry kernel0 is out of range. .minnctapersm and .maxntid will be ignored
CUDA error in file '/root/.freetensor/o17vag/run.cu' in line 73 : invalid argument.
Traceback (most recent call last):
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 87, in
transr()
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 84, in transr
result = eval(func, True, True)
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 50, in eval
t1, _ = driver.time()
RuntimeError: cuda error
Here is my code:
Can you help me to fix this issue? Thank you so much!