Skip to content

How to define intermediate result when using schedules? #502

@hulihan-start

Description

@hulihan-start

I followed your test code on: https://github.com/roastduck/FreeTensor/blob/master/test/70.program/test_gpu_conv2d.py
I'm not sure if 'cache' is the correct keyword for this case, but a CUDA error was found:
ptxas warning : Value of threads per SM for entry kernel0 is out of range. .minnctapersm and .maxntid will be ignored
CUDA error in file '/root/.freetensor/o17vag/run.cu' in line 73 : invalid argument.
Traceback (most recent call last):
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 87, in
transr()
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 84, in transr
result = eval(func, True, True)
File "/data/not_backed_up/lihhu/FreeTensor_experiments/TransR_scheduler.py", line 50, in eval
t1, _ = driver.time()
RuntimeError: cuda error

Here is my code:

import freetensor as ft
import torch
device = ft.GPU(0)
target = device.target()
host = ft.CPU()

h = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
t = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
r = torch.randint(0, 4096, (4096, ), dtype=torch.int64).cuda(0)
eemb = torch.rand(93773, 512).cuda(0)
remb = torch.rand(51, 512).cuda(0)
proj = torch.rand(51, 512, 512).cuda(0)
res = torch.rand(4096, 512).cuda(0)

batch_size = h.shape[0]
dim = eemb.shape[1]
enode = eemb.shape[0]
rnode = remb.shape[0]

def transr():
    def eval(func, print_code=False, time=False):
        func = ft.lower(func, target)
        if print_code:
            print(func, flush=True)
        code = ft.codegen(func, target)
        if print_code:
            print(code, flush=True)
        driver = ft.build_binary(code, device)
        res = torch.zeros(batch_size,).cuda(0)
        
        head = ft.Array(h)
        tail = ft.Array(t)
        relation = ft.Array(r)
        entemb = ft.Array(eemb)
        relemb = ft.Array(remb)
        pemb = ft.Array(proj)
        res = ft.Array(res)
        
        driver.set_args(heads=head, tails=tail, relations=relation, entemb=entemb, relemb=relemb, pemb=pemb, result=res)
        if time:
            t1, _ = driver.time()
            print("time: %s ms" % t1)
        else:
            driver.run()
        B_np = res.torch()
        return B_np
    
    @ft.transform
    def score_func(heads, tails, relations, entemb, relemb, pemb, result):
        heads: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
        tails: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
        relations: ft.Var[(batch_size, ), "int64", "input", "gpu/global"]
        entemb: ft.Var[(enode, dim, ), "float32", "input", "gpu/global"]
        relemb: ft.Var[(enode, dim, ), "float32", "input", "gpu/global"]
        pemb: ft.Var[(enode, dim, dim, ), "float32", "input", "gpu/global"]
        result: ft.Var[(batch_size, ), "float32", "output", "gpu/global"]
        inter: ft.Var[(batch_size, dim,), "float32", "cache", "gpu/global"]

        # inter = ft.empty((batch_size, dim), "float32")

        #! label: bx
        for bb in range(batch_size):
            #! label: ty
            for dd in range(dim):
                #! label: tx
                for kk in range(dim):
                    inter[bb, dd] += (entemb[heads[bb], kk] - entemb[tails[bb], kk]) * pemb[relations[bb], kk, dd]
                result[bb] += ft.abs(inter[bb, dd] + relemb[relations[bb], dd])

    s = ft.Schedule(score_func)
    s.parallelize("bx", "blockIdx.x")
    s.parallelize("ty", "threadIdx.y")
    s.parallelize("tx", "threadIdx.x")
    func = s.func()
    result = eval(func, True, True)


transr()

Can you help me to fix this issue? Thank you so much!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions