From 8a477d642269d94ff6e07957770539e0a00b7163 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 07:02:50 +0200 Subject: [PATCH 1/2] PTX: Add support for selecting the GPU feature set. --- src/ptx.jl | 22 +++++++++++++++++++--- test/helpers/ptx.jl | 4 +++- test/ptx.jl | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/src/ptx.jl b/src/ptx.jl index 66880850..18c6e014 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -8,6 +8,12 @@ Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget cap::VersionNumber ptx::VersionNumber = v"6.0" # for compatibility with older versions of CUDA.jl + # subtarget feature set, selecting the suffix on the LLVM CPU name (and `.target`): + # :baseline (no suffix) - forward-compatible (sm_X for any sm_Y >= X) + # :family ('f' suffix) - same-major-family-portable; gates 'f'-tier intrinsics + # :arch ('a' suffix) - locked to one exact CC; unlocks all arch-accel intrinsics + feature_set::Symbol = :baseline + # codegen quirks ## can we emit debug info in the PTX assembly? debuginfo::Bool = false @@ -28,6 +34,7 @@ end function Base.hash(target::PTXCompilerTarget, h::UInt) h = hash(target.cap, h) h = hash(target.ptx, h) + h = hash(target.feature_set, h) h = hash(target.debuginfo, h) @@ -40,6 +47,15 @@ function Base.hash(target::PTXCompilerTarget, h::UInt) h end +# format the LLVM CPU / PTX `.target` name for this target +function cpu_name(target::PTXCompilerTarget) + suffix = target.feature_set === :arch ? "a" : + target.feature_set === :family ? "f" : + target.feature_set === :baseline ? "" : + error("PTXCompilerTarget.feature_set must be one of :baseline, :family, :arch; got $(repr(target.feature_set))") + return "sm_$(target.cap.major)$(target.cap.minor)$suffix" +end + source_code(target::PTXCompilerTarget) = "ptx" llvm_triple(target::PTXCompilerTarget) = Int===Int64 ? 
"nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda" @@ -51,7 +67,7 @@ function llvm_machine(target::PTXCompilerTarget) triple = llvm_triple(target) t = Target(triple=triple) - tm = TargetMachine(t, triple, "sm_$(target.cap.major)$(target.cap.minor)", + tm = TargetMachine(t, triple, cpu_name(target), "+ptx$(target.ptx.major)$(target.ptx.minor)") asm_verbosity!(tm, true) @@ -84,7 +100,7 @@ can_vectorize(job::CompilerJob{PTXCompilerTarget}) = true function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget})) print(io, "PTX CompilerJob of ", job.source) - print(io, " for sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)") + print(io, " for ", cpu_name(job.config.target)) job.config.target.minthreads !== nothing && print(io, ", minthreads=$(job.config.target.minthreads)") job.config.target.maxthreads !== nothing && print(io, ", maxthreads=$(job.config.target.maxthreads)") @@ -100,7 +116,7 @@ isintrinsic(@nospecialize(job::CompilerJob{PTXCompilerTarget}), fn::String) = # XXX: the debuginfo part should be handled by GPUCompiler as it applies to all back-ends. runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) = "ptx$(job.config.target.ptx.major)$(job.config.target.ptx.minor)" * - "-sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)" * + "-$(cpu_name(job.config.target))" * "-debuginfo=$(Int(llvm_debug_info(job)))" function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl index e82416bc..634b59de 100644 --- a/test/helpers/ptx.jl +++ b/test/helpers/ptx.jl @@ -36,12 +36,14 @@ end GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime function create_job(@nospecialize(func), @nospecialize(types); + cap=v"7.0", ptx=v"6.0", feature_set=:baseline, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, kwargs...) 
config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) - target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs) + target = PTXCompilerTarget(; cap, ptx, feature_set, + minthreads, maxthreads, blocks_per_sm, maxregs) params = CompilerParams() config = CompilerConfig(target, params; kernel=false, config_kwargs...) CompilerJob(source, config), kwargs diff --git a/test/ptx.jl b/test/ptx.jl index 7010917a..1bda96ef 100644 --- a/test/ptx.jl +++ b/test/ptx.jl @@ -419,5 +419,49 @@ end PTX.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}}) end +@testset "feature_set" begin + # PTXCompilerTarget.feature_set controls the suffix on the LLVM CPU name, which is + # what the NVPTX backend uses to flip `hasArchAccelFeatures()`. Verify it makes its + # way into the `.target` directive that LLVM emits and into the hash. + + mod = @eval module $(gensym()) + kernel() = return + end + + # cpu_name reflects feature_set + @test GPUCompiler.cpu_name(PTXCompilerTarget(cap=v"9.0")) == "sm_90" + @test GPUCompiler.cpu_name(PTXCompilerTarget(cap=v"9.0", feature_set=:baseline)) == "sm_90" + @test GPUCompiler.cpu_name(PTXCompilerTarget(cap=v"9.0", feature_set=:arch)) == "sm_90a" + @test GPUCompiler.cpu_name(PTXCompilerTarget(cap=v"10.0", feature_set=:family)) == "sm_100f" + @test_throws ErrorException GPUCompiler.cpu_name(PTXCompilerTarget(cap=v"9.0", feature_set=:bogus)) + + # hash must discriminate, otherwise two configs differing only on feature_set + # would collide in the compiler cache (`runtime_slug` already differs via `cpu_name`). + @test hash(PTXCompilerTarget(cap=v"9.0", feature_set=:baseline)) != + hash(PTXCompilerTarget(cap=v"9.0", feature_set=:arch)) + + # LLVM picked up `sm_90a` in v18 (NVPTX.td); older releases don't know the suffix. 
+ if LLVM.version() >= v"18" + @test @filecheck begin + @check ".target sm_90a" + PTX.code_native(mod.kernel, Tuple{}; cap=v"9.0", ptx=v"8.0", + feature_set=:arch, kernel=true, dump_module=true) + end + end + # `sm_100f` (and the rest of the family-/arch-specific Blackwell variants) was added in LLVM 20. + if LLVM.version() >= v"20" + @test @filecheck begin + @check ".target sm_100f" + PTX.code_native(mod.kernel, Tuple{}; cap=v"10.0", ptx=v"8.8", + feature_set=:family, kernel=true, dump_module=true) + end + @test @filecheck begin + @check ".target sm_100a" + PTX.code_native(mod.kernel, Tuple{}; cap=v"10.0", ptx=v"8.6", + feature_set=:arch, kernel=true, dump_module=true) + end + end +end + end end # NVPTX in LLVM.backends() From 2f7c254dd85748e7b88428d941887f972f268149 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 09:12:45 +0200 Subject: [PATCH 2/2] Expose feature set back to the calling code. --- src/ptx.jl | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/ptx.jl b/src/ptx.jl index 18c6e014..bd824642 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -4,6 +4,15 @@ export PTXCompilerTarget +# Wire-format encoding of the feature set, stamped into the `sm_features` LLVM +# global by `finish_module!` and read back from device code by the calling +# package's intrinsics (e.g. CUDA.jl's `target_feature_set()`). +@enum TargetFeatureSet::UInt32 begin + BaselineFeatures = 0 + FamilyFeatures = 1 + ArchFeatures = 2 +end + Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget cap::VersionNumber ptx::VersionNumber = v"6.0" # for compatibility with older versions of CUDA.jl @@ -125,10 +134,14 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), # it possible to 'query' these in device code, relying on LLVM to optimize the checks # away and generate static code. note that we only do so if there's actual uses of these # variables; unconditionally creating a gvar would result in duplicate declarations. 
- for (name, value) in ["sm_major" => job.config.target.cap.major, - "sm_minor" => job.config.target.cap.minor, - "ptx_major" => job.config.target.ptx.major, - "ptx_minor" => job.config.target.ptx.minor] + sm_features = job.config.target.feature_set === :arch ? ArchFeatures : + job.config.target.feature_set === :family ? FamilyFeatures : + BaselineFeatures + for (name, value) in ["sm_major" => job.config.target.cap.major, + "sm_minor" => job.config.target.cap.minor, + "sm_features" => UInt32(sm_features), + "ptx_major" => job.config.target.ptx.major, + "ptx_minor" => job.config.target.ptx.minor] if haskey(globals(mod), name) gv = globals(mod)[name] initializer!(gv, ConstantInt(LLVM.Int32Type(), value))