JuliaGPU · maleadt · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/src/ptx.jl b/src/ptx.jl
@@ -4,10 +4,25 @@
 
 export PTXCompilerTarget
 
+# Wire-format encoding of the feature set, stamped into the `sm_features` LLVM
+# global by `finish_module!` and read back by device-side runtime intrinsics (e.g.
+# CUDA.jl's `target_feature_set()`).
+@enum TargetFeatureSet::UInt32 begin
+    BaselineFeatures = 0  # `feature_set = :baseline` (also the fallback in `finish_module!`)
+    FamilyFeatures   = 1  # `feature_set = :family`
+    ArchFeatures     = 2  # `feature_set = :arch`
+end
+
 Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget
     cap::VersionNumber
     ptx::VersionNumber = v"6.0" # for compatibility with older versions of CUDA.jl
 
+    # subtarget feature set, selecting the suffix on the LLVM CPU name (and `.target`):
+    #   :baseline (no suffix)   - forward-compatible (sm_X for any sm_Y >= X)
+    #   :family   ('f' suffix)  - same-major-family-portable; gates 'f'-tier intrinsics
+    #   :arch     ('a' suffix)  - locked to one exact CC; unlocks all arch-accel intrinsics
+    feature_set::Symbol = :baseline
+
     # codegen quirks
     ## can we emit debug info in the PTX assembly?
     debuginfo::Bool = false
@@ -28,6 +43,7 @@ end
 function Base.hash(target::PTXCompilerTarget, h::UInt)
     h = hash(target.cap, h)
     h = hash(target.ptx, h)
+    h = hash(target.feature_set, h)
 
     h = hash(target.debuginfo, h)
 
@@ -40,6 +56,15 @@ function Base.hash(target::PTXCompilerTarget, h::UInt)
     h
 end
 
+# build the subtarget name (`sm_<major><minor><suffix>`) used as LLVM CPU / PTX `.target`
+function cpu_name(target::PTXCompilerTarget)
+    fs = target.feature_set
+    suffix = fs === :baseline ? "" : fs === :family ? "f" : fs === :arch ? "a" :
+        error("PTXCompilerTarget.feature_set must be one of :baseline, :family, :arch; got $(repr(target.feature_set))")
+    cap = target.cap
+    return "sm_$(cap.major)$(cap.minor)$suffix"
+end
+
 source_code(target::PTXCompilerTarget) = "ptx"
 
 llvm_triple(target::PTXCompilerTarget) = Int===Int64 ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda"
@@ -51,7 +76,7 @@ function llvm_machine(target::PTXCompilerTarget)
     triple = llvm_triple(target)
     t = Target(triple=triple)
 
-    tm = TargetMachine(t, triple, "sm_$(target.cap.major)$(target.cap.minor)",
+    tm = TargetMachine(t, triple, cpu_name(target),
                        "+ptx$(target.ptx.major)$(target.ptx.minor)")
     asm_verbosity!(tm, true)
 
@@ -84,7 +109,7 @@ can_vectorize(job::CompilerJob{PTXCompilerTarget}) = true
 
 function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget}))
     print(io, "PTX CompilerJob of ", job.source)
-    print(io, " for sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)")
+    print(io, " for ", cpu_name(job.config.target))
 
     job.config.target.minthreads !== nothing && print(io, ", minthreads=$(job.config.target.minthreads)")
     job.config.target.maxthreads !== nothing && print(io, ", maxthreads=$(job.config.target.maxthreads)")
@@ -100,7 +125,7 @@ isintrinsic(@nospecialize(job::CompilerJob{PTXCompilerTarget}), fn::String) =
 # XXX: the debuginfo part should be handled by GPUCompiler as it applies to all back-ends.
 runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
     "ptx$(job.config.target.ptx.major)$(job.config.target.ptx.minor)" *
-    "-sm_$(job.config.target.cap.major)$(job.config.target.cap.minor)" *
+    "-$(cpu_name(job.config.target))" *
     "-debuginfo=$(Int(llvm_debug_info(job)))"
 
 function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
@@ -109,10 +134,14 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
     # it possible to 'query' these in device code, relying on LLVM to optimize the checks
     # away and generate static code. note that we only do so if there's actual uses of these
     # variables; unconditionally creating a gvar would result in duplicate declarations.
-    for (name, value) in ["sm_major"  => job.config.target.cap.major,
-                          "sm_minor"  => job.config.target.cap.minor,
-                          "ptx_major" => job.config.target.ptx.major,
-                          "ptx_minor" => job.config.target.ptx.minor]
+    sm_features = job.config.target.feature_set === :arch    ? ArchFeatures :
+                  job.config.target.feature_set === :family  ? FamilyFeatures :
+                                                               BaselineFeatures
+    for (name, value) in ["sm_major"    => job.config.target.cap.major,
+                          "sm_minor"    => job.config.target.cap.minor,
+                          "sm_features" => UInt32(sm_features),
+                          "ptx_major"   => job.config.target.ptx.major,
+                          "ptx_minor"   => job.config.target.ptx.minor]
         if haskey(globals(mod), name)
             gv = globals(mod)[name]
             initializer!(gv, ConstantInt(LLVM.Int32Type(), value))

diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl
@@ -36,12 +36,14 @@ end
 GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
+                    cap=v"7.0", ptx=v"6.0", feature_set=:baseline,
                     minthreads=nothing, maxthreads=nothing,
                     blocks_per_sm=nothing, maxregs=nothing,
                     kwargs...)
     config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
-    target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs)
+    target = PTXCompilerTarget(; cap, ptx, feature_set,
+                                 minthreads, maxthreads, blocks_per_sm, maxregs)
     params = CompilerParams()
     config = CompilerConfig(target, params; kernel=false, config_kwargs...)
     CompilerJob(source, config), kwargs

diff --git a/test/ptx.jl b/test/ptx.jl
@@ -419,5 +419,49 @@ end
     PTX.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}})
 end
 
+@testset "feature_set" begin
+    # `feature_set` on PTXCompilerTarget selects the suffix of the LLVM CPU name, which the
+    # NVPTX backend uses to flip `hasArchAccelFeatures()`. Check that the choice reaches
+    # both the `.target` directive LLVM emits and the target's hash.
+
+    mod = @eval module $(gensym())
+        kernel() = return
+    end
+
+    # the CPU name follows feature_set, and unknown values are rejected
+    name_for(; kwargs...) = GPUCompiler.cpu_name(PTXCompilerTarget(; kwargs...))
+    @test name_for(cap=v"9.0") == "sm_90"
+    @test name_for(cap=v"9.0", feature_set=:baseline) == "sm_90"
+    @test name_for(cap=v"9.0", feature_set=:arch) == "sm_90a"
+    @test name_for(cap=v"10.0", feature_set=:family) == "sm_100f"
+    @test_throws ErrorException name_for(cap=v"9.0", feature_set=:bogus)
+
+    # targets differing only in feature_set must hash apart, or they collide in the compiler cache
+    base, arch = map(fs -> PTXCompilerTarget(cap=v"9.0", feature_set=fs), (:baseline, :arch))
+    @test hash(base) != hash(arch)
+
+    # the `sm_90a` CPU name is known to LLVM from v18 on (NVPTX.td); older releases reject it.
+    if LLVM.version() >= v"18"
+        @test @filecheck begin
+            @check ".target sm_90a"
+            PTX.code_native(mod.kernel, Tuple{}; cap=v"9.0", ptx=v"8.0",
+                            feature_set=:arch, kernel=true, dump_module=true)
+        end
+    end
+    # the family-/arch-specific Blackwell variants (`sm_100f`, `sm_100a`, ...) arrived in LLVM 20.
+    if LLVM.version() >= v"20"
+        @test @filecheck begin
+            @check ".target sm_100f"
+            PTX.code_native(mod.kernel, Tuple{}; cap=v"10.0", ptx=v"8.8",
+                            feature_set=:family, kernel=true, dump_module=true)
+        end
+        @test @filecheck begin
+            @check ".target sm_100a"
+            PTX.code_native(mod.kernel, Tuple{}; cap=v"10.0", ptx=v"8.6",
+                            feature_set=:arch, kernel=true, dump_module=true)
+        end
+    end
+end
+
 end
 end # NVPTX in LLVM.backends()