Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions src/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,25 @@

export PTXCompilerTarget

# Numeric (wire-format) encoding of the subtarget feature set. `finish_module!` stamps
# this value into the `sm_features` LLVM global, from where runtime intrinsics (e.g.
# CUDA.jl's `target_feature_set()`) read it back.
@enum TargetFeatureSet::UInt32 BaselineFeatures=0 FamilyFeatures=1 ArchFeatures=2

Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget
cap::VersionNumber
ptx::VersionNumber = v"6.0" # for compatibility with older versions of CUDA.jl

# subtarget feature set, selecting the suffix on the LLVM CPU name (and `.target`):
# :baseline (no suffix) - forward-compatible (sm_X for any sm_Y >= X)
# :family ('f' suffix) - same-major-family-portable; gates 'f'-tier intrinsics
# :arch ('a' suffix) - locked to one exact CC; unlocks all arch-accel intrinsics
feature_set::Symbol = :baseline

# codegen quirks
## can we emit debug info in the PTX assembly?
debuginfo::Bool = false
Expand All @@ -28,6 +43,7 @@ end
function Base.hash(target::PTXCompilerTarget, h::UInt)
h = hash(target.cap, h)
h = hash(target.ptx, h)
h = hash(target.feature_set, h)

h = hash(target.debuginfo, h)

Expand All @@ -40,6 +56,15 @@ function Base.hash(target::PTXCompilerTarget, h::UInt)
h
end

# Build the LLVM CPU / PTX `.target` name for `target`: `sm_<major><minor>` followed by
# the feature-set suffix ("" for :baseline, "f" for :family, "a" for :arch).
# Raises an error for any other `feature_set` value.
function cpu_name(target::PTXCompilerTarget)
    features = target.feature_set
    tail = if features === :baseline
        ""
    elseif features === :family
        "f"
    elseif features === :arch
        "a"
    else
        error("PTXCompilerTarget.feature_set must be one of :baseline, :family, :arch; got $(repr(target.feature_set))")
    end
    return string("sm_", target.cap.major, target.cap.minor, tail)
end

# the source code kind of a PTX target is always the string "ptx"
function source_code(target::PTXCompilerTarget)
    return "ptx"
end

# LLVM target triple: the `nvptx64` triple when `Int` is `Int64`, the `nvptx` one otherwise
function llvm_triple(target::PTXCompilerTarget)
    if Int === Int64
        return "nvptx64-nvidia-cuda"
    end
    return "nvptx-nvidia-cuda"
end
Expand All @@ -51,7 +76,7 @@ function llvm_machine(target::PTXCompilerTarget)
triple = llvm_triple(target)
t = Target(triple=triple)

tm = TargetMachine(t, triple, "sm_$(target.cap.major)$(target.cap.minor)",
tm = TargetMachine(t, triple, cpu_name(target),
"+ptx$(target.ptx.major)$(target.ptx.minor)")
asm_verbosity!(tm, true)

Expand Down Expand Up @@ -84,7 +109,7 @@ can_vectorize(job::CompilerJob{PTXCompilerTarget}) = true

function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget}))
print(io, "PTX CompilerJob of ", job.source)
print(io, " for sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)")
print(io, " for ", cpu_name(job.config.target))

job.config.target.minthreads !== nothing && print(io, ", minthreads=$(job.config.target.minthreads)")
job.config.target.maxthreads !== nothing && print(io, ", maxthreads=$(job.config.target.maxthreads)")
Expand All @@ -100,7 +125,7 @@ isintrinsic(@nospecialize(job::CompilerJob{PTXCompilerTarget}), fn::String) =
# XXX: the debuginfo part should be handled by GPUCompiler as it applies to all back-ends.
# Slug for the runtime built for this job, concatenated from: the PTX version
# (`ptx<major><minor>`), the CPU name of the target, and the integer value of
# `llvm_debug_info(job)`.
# NOTE(review): this span was taken from a diff view; the `-sm_…` segment and the
# `-$(cpu_name(…))` segment below look like the removed and added sides of the same
# change — confirm that only the `cpu_name` form is present in the real file.
runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
"ptx$(job.config.target.ptx.major)$(job.config.target.ptx.minor)" *
"-sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)" *
"-$(cpu_name(job.config.target))" *
"-debuginfo=$(Int(llvm_debug_info(job)))"

function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
Expand All @@ -109,10 +134,14 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
# it possible to 'query' these in device code, relying on LLVM to optimize the checks
# away and generate static code. note that we only do so if there's actual uses of these
# variables; unconditionally creating a gvar would result in duplicate declarations.
for (name, value) in ["sm_major" => job.config.target.cap.major,
"sm_minor" => job.config.target.cap.minor,
"ptx_major" => job.config.target.ptx.major,
"ptx_minor" => job.config.target.ptx.minor]
sm_features = job.config.target.feature_set === :arch ? ArchFeatures :
job.config.target.feature_set === :family ? FamilyFeatures :
BaselineFeatures
for (name, value) in ["sm_major" => job.config.target.cap.major,
"sm_minor" => job.config.target.cap.minor,
"sm_features" => UInt32(sm_features),
"ptx_major" => job.config.target.ptx.major,
"ptx_minor" => job.config.target.ptx.minor]
if haskey(globals(mod), name)
gv = globals(mod)[name]
initializer!(gv, ConstantInt(LLVM.Int32Type(), value))
Expand Down
4 changes: 3 additions & 1 deletion test/helpers/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,14 @@ end
# PTX test jobs use `PTXTestRuntime` as their runtime module
function GPUCompiler.runtime_module(::PTXCompilerJob)
    return PTXTestRuntime
end

function create_job(@nospecialize(func), @nospecialize(types);
cap=v"7.0", ptx=v"6.0", feature_set=:baseline,
minthreads=nothing, maxthreads=nothing,
blocks_per_sm=nothing, maxregs=nothing,
kwargs...)
config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS)
source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs)
target = PTXCompilerTarget(; cap, ptx, feature_set,
minthreads, maxthreads, blocks_per_sm, maxregs)
params = CompilerParams()
config = CompilerConfig(target, params; kernel=false, config_kwargs...)
CompilerJob(source, config), kwargs
Expand Down
44 changes: 44 additions & 0 deletions test/ptx.jl
Original file line number Diff line number Diff line change
Expand Up @@ -419,5 +419,49 @@ end
PTX.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}})
end

@testset "feature_set" begin
    # The `feature_set` field of `PTXCompilerTarget` selects the suffix on the LLVM CPU
    # name, which is what the NVPTX backend uses to flip `hasArchAccelFeatures()`. Check
    # that the suffix reaches the `.target` directive LLVM emits and that it takes part
    # in the hash.

    sandbox = @eval module $(gensym())
        kernel() = return
    end

    # the CPU name follows feature_set (defaulting to the baseline, suffix-less form)
    name_for(; kwargs...) = GPUCompiler.cpu_name(PTXCompilerTarget(; kwargs...))
    @test name_for(cap=v"9.0") == "sm_90"
    @test name_for(cap=v"9.0", feature_set=:baseline) == "sm_90"
    @test name_for(cap=v"9.0", feature_set=:arch) == "sm_90a"
    @test name_for(cap=v"10.0", feature_set=:family) == "sm_100f"
    @test_throws ErrorException name_for(cap=v"9.0", feature_set=:bogus)

    # the hash has to tell feature sets apart; otherwise two configs that differ only in
    # feature_set would share the same on-disk runtime slug and collide in the compiler
    # cache.
    baseline_target = PTXCompilerTarget(cap=v"9.0", feature_set=:baseline)
    arch_target = PTXCompilerTarget(cap=v"9.0", feature_set=:arch)
    @test hash(baseline_target) != hash(arch_target)

    # `sm_90a` landed in LLVM 18 (NVPTX.td); older releases don't know the suffix.
    if LLVM.version() >= v"18"
        @test @filecheck begin
            @check ".target sm_90a"
            PTX.code_native(sandbox.kernel, Tuple{}; cap=v"9.0", ptx=v"8.0",
                            feature_set=:arch, kernel=true, dump_module=true)
        end
    end

    # `sm_100f` (and the rest of the family-/arch-specific Blackwell variants) was added
    # in LLVM 20.
    if LLVM.version() >= v"20"
        @test @filecheck begin
            @check ".target sm_100f"
            PTX.code_native(sandbox.kernel, Tuple{}; cap=v"10.0", ptx=v"8.8",
                            feature_set=:family, kernel=true, dump_module=true)
        end
        @test @filecheck begin
            @check ".target sm_100a"
            PTX.code_native(sandbox.kernel, Tuple{}; cap=v"10.0", ptx=v"8.6",
                            feature_set=:arch, kernel=true, dump_module=true)
        end
    end
end

end
end # NVPTX in LLVM.backends()
Loading