Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 59 additions & 39 deletions src/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,65 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
if job.config.kernel
# work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
entry = lower_byval(job, mod, entry)

# emit kernel property annotations into the module. these have to be in
# place before optimization runs: LLVM's NVPTX target machine registers a
# PipelineStart EP callback that schedules NVVMIntrRangePass, which calls
# `getMaxNTID` on every function. That populates a module-keyed
# `AnnotationCache` entry (empty, because `nvvm.annotations` isn't there
# yet), and subsequent lookups by the asm printer hit the stale empty
# entry instead of re-reading the metadata.
annotations = Metadata[entry]

## kernel metadata
append!(annotations, [MDString("kernel"),
ConstantInt(Int32(1))])

## expected CTA sizes
if job.config.target.minthreads !== nothing
bounds = ntuple(i -> i <= length(job.config.target.minthreads) ?
job.config.target.minthreads[i] : 1, 3)
for (bound, name) in zip(bounds, (:x, :y, :z))
append!(annotations, [MDString("reqntid$name"),
ConstantInt(Int32(bound))])
end
if LLVM.version() >= v"21"
push!(function_attributes(entry),
StringAttribute("nvvm.reqntid", join(bounds, ",")))
end
end
if job.config.target.maxthreads !== nothing
bounds = ntuple(i -> i <= length(job.config.target.maxthreads) ?
job.config.target.maxthreads[i] : 1, 3)
for (bound, name) in zip(bounds, (:x, :y, :z))
append!(annotations, [MDString("maxntid$name"),
ConstantInt(Int32(bound))])
end
if LLVM.version() >= v"21"
push!(function_attributes(entry),
StringAttribute("nvvm.maxntid", join(bounds, ",")))
end
end

if job.config.target.blocks_per_sm !== nothing
append!(annotations, [MDString("minctasm"),
ConstantInt(Int32(job.config.target.blocks_per_sm))])
if LLVM.version() >= v"21"
push!(function_attributes(entry),
StringAttribute("nvvm.minctasm", string(job.config.target.blocks_per_sm)))
end
end

if job.config.target.maxregs !== nothing
append!(annotations, [MDString("maxnreg"),
ConstantInt(Int32(job.config.target.maxregs))])
if LLVM.version() >= v"21"
push!(function_attributes(entry),
StringAttribute("nvvm.maxnreg", string(job.config.target.maxregs)))
end
end

push!(metadata(mod)["nvvm.annotations"], MDNode(annotations))
end

# we emit properties (of the device and ptx isa) as private global constants,
Expand Down Expand Up @@ -227,45 +286,6 @@ function finish_ir!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
end
end

if job.config.kernel
# add metadata annotations for the assembler to the module

# property annotations
annotations = Metadata[entry]

## kernel metadata
append!(annotations, [MDString("kernel"),
ConstantInt(Int32(1))])

## expected CTA sizes
if job.config.target.minthreads !== nothing
for (dim, name) in enumerate([:x, :y, :z])
bound = dim <= length(job.config.target.minthreads) ? job.config.target.minthreads[dim] : 1
append!(annotations, [MDString("reqntid$name"),
ConstantInt(Int32(bound))])
end
end
if job.config.target.maxthreads !== nothing
for (dim, name) in enumerate([:x, :y, :z])
bound = dim <= length(job.config.target.maxthreads) ? job.config.target.maxthreads[dim] : 1
append!(annotations, [MDString("maxntid$name"),
ConstantInt(Int32(bound))])
end
end

if job.config.target.blocks_per_sm !== nothing
append!(annotations, [MDString("minctasm"),
ConstantInt(Int32(job.config.target.blocks_per_sm))])
end

if job.config.target.maxregs !== nothing
append!(annotations, [MDString("maxnreg"),
ConstantInt(Int32(job.config.target.maxregs))])
end

push!(metadata(mod)["nvvm.annotations"], MDNode(annotations))
end

return entry
end

Expand Down
5 changes: 4 additions & 1 deletion test/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,13 @@ if :NVPTX in LLVM.backends()
end
end

# the assembler emits `call.uni` and the callee name on the same line in
# LLVM 21+, but on separate lines on older releases.
@test @filecheck begin
@check_label ".visible .func {{(julia|j)_parent[0-9_]*}}"
@check "call.uni"
@check_next "{{(julia|j)_child_}}"
@check_same cond=(LLVM.version() >= v"21") "{{(julia|j)_child_}}"
@check_next cond=(LLVM.version() < v"21") "{{(julia|j)_child_}}"
PTX.code_native(mod.parent, Tuple{Int64})
end
end
Expand Down
80 changes: 51 additions & 29 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,39 +1,61 @@
using ParallelTestRunner
import GPUCompiler, LLVM

const init_code = quote
using GPUCompiler, LLVM
using SPIRV_LLVM_Backend_jll, SPIRV_LLVM_Translator_jll, SPIRV_Tools_jll
# DEBUG: bypass ParallelTestRunner entirely and run the metal testset directly
# in this process. The worker stdio path was eating libjulia's signal output
# even with Malt's live forwarding enabled, so try the simplest thing: let any
# crash kill the parent and dump its stderr to the CI log without any
# intermediary.
using GPUCompiler, LLVM
using SPIRV_LLVM_Backend_jll, SPIRV_LLVM_Translator_jll, SPIRV_Tools_jll
using Test
using FileCheck

# include all helpers
include(joinpath(@__DIR__, "helpers", "runtime.jl"))
for file in readdir(joinpath(@__DIR__, "helpers"))
if endswith(file, ".jl") && file != "runtime.jl"
include(joinpath(@__DIR__, "helpers", file))
end
include(joinpath(@__DIR__, "helpers", "runtime.jl"))
for file in readdir(joinpath(@__DIR__, "helpers"))
if endswith(file, ".jl") && file != "runtime.jl"
include(joinpath(@__DIR__, "helpers", file))
end
using FileCheck
end

testsuite = find_tests(@__DIR__)
args = parse_args(ARGS)
println("=== bisecting metal `byref aggregates` and `byref primitives` ===")
flush(stdout)

if filter_tests!(testsuite, args)
helperkeys = String[]
for key in collect(keys(testsuite))
startswith(key, "helpers/") && push!(helperkeys, key)
end
for key in helperkeys
delete!(testsuite, key)
end
# Previous CI run pinned the SIGSEGV to the "byref primitives" metal
# testset (entered, then SIGSEGV within ~4s, no later marker). Open up
# both that testset and the preceding "byref aggregates" — since the
# crash may depend on accumulated state from the prior testset — and
# announce each Metal.code_llvm call on stderr.

if LLVM.is_asserts()
delete!(testsuite, "gcn")
end
if VERSION < v"1.11"
delete!(testsuite, "ptx/precompile")
delete!(testsuite, "native/precompile")
end
# Print a progress marker (prefixed with "▶ ") to stderr and flush right away,
# so the marker is written out before the next call is attempted.
function step(msg...)
    println(stderr, "▶ ", msg...)
    return flush(stderr)
end

# Announce the step, then create a fresh module with a gensym'd name that
# contains only a no-op `kernel(x)`; `mod_agg.kernel` is what the
# "byref aggregates" `code_llvm` calls further down compile.
step("byref aggregates: @eval module")
mod_agg = @eval module $(gensym())
kernel(x) = return
end

runtests(GPUCompiler, args; testsuite, init_code)
# Each call below is preceded by a marker on stderr and followed by a "done"
# marker reporting how many bytes of IR were written, so the last marker in the
# log identifies the call that did not return.
# NOTE(review): `Metal` is not defined in this file — presumably it comes from
# the included test helpers; confirm.

# "byref aggregates": the kernel takes a single `Tuple{Int}` argument.
# First without the `kernel` keyword ...
step("byref aggregates: code_llvm(Tuple{Tuple{Int}}) — non-kernel")
io = IOBuffer()
Metal.code_llvm(io, mod_agg.kernel, Tuple{Tuple{Int}})
step(" done; ", length(take!(io)), " bytes")

# ... then with `kernel=true`.
step("byref aggregates: code_llvm(Tuple{Tuple{Int}}; kernel=true)")
io = IOBuffer()
Metal.code_llvm(io, mod_agg.kernel, Tuple{Tuple{Int}}; kernel=true)
step(" done; ", length(take!(io)), " bytes")

# "byref primitives": a second fresh module with its own no-op kernel, created
# only after the aggregate calls have run so the order of operations is kept.
step("byref primitives: @eval module")
mod_prim = @eval module $(gensym())
kernel(x) = return
end

# Same two calls as above, but the kernel takes a single `Int` argument.
step("byref primitives: code_llvm(Tuple{Int}) — non-kernel")
io = IOBuffer()
Metal.code_llvm(io, mod_prim.kernel, Tuple{Int})
step(" done; ", length(take!(io)), " bytes")

step("byref primitives: code_llvm(Tuple{Int}; kernel=true)")
io = IOBuffer()
Metal.code_llvm(io, mod_prim.kernel, Tuple{Int}; kernel=true)
step(" done; ", length(take!(io)), " bytes")

# Final marker: reaching this line means none of the calls above crashed.
step("ALL CALLS COMPLETED")
Loading