diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index bab293aba3..2c9992781e 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.10"
+GPUCompiler = "1.12"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/lib/cudadrv/state.jl b/CUDACore/lib/cudadrv/state.jl
index 6c03564bb8..32159136fa 100644
--- a/CUDACore/lib/cudadrv/state.jl
+++ b/CUDACore/lib/cudadrv/state.jl
@@ -227,7 +227,7 @@ function context(dev::CuDevice)
                maxlog=1, _id=devidx)
     end
     # ... or too new
-    if !in(capability(dev), cuda_compat().cap)
+    if !in(capability(dev), ptxas_compat().cap)
         @warn("""Your $(name(dev)) GPU (compute capability $(capability(dev).major).$(capability(dev).minor)) is not fully supported by CUDA $(runtime_version()).
                  Some functionality may be broken. Ensure you are using the latest version of CUDA.jl in combination with an up-to-date NVIDIA driver.
                  If that does not help, please file an issue to add support for the latest CUDA toolkit.""",
diff --git a/CUDACore/src/CUDACore.jl b/CUDACore/src/CUDACore.jl
index fa2bfa2b45..c0332759ff 100644
--- a/CUDACore/src/CUDACore.jl
+++ b/CUDACore/src/CUDACore.jl
@@ -80,6 +80,7 @@ include("../lib/cudadrv/CUDAdrv.jl")
 
 # essential stuff
 include("initialization.jl")
+include("compiler/sm.jl")
 include("compatibility.jl")
 include("debug.jl")
 
diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index afb833b32a..0f512a0255 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -1,15 +1,25 @@
 # compatibility of Julia, CUDA and LLVM
 
-# NOTE: Target architectures with suffix “a”, such as sm_90a, include
-# architecture-accelerated features that are supported on the specified architecture only,
-# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
-# such targets cannot be run on later generation devices. Architecture-accelerated features
-# can only be used with targets that support these features.
-
 const lowest = v"0"
 const highest = v"999"
 
 
+# PTX compilation targets come in three feature-set flavors (carried on `SMVersion`),
+# selected via the suffix on the `.target` directive (and the matching `--gpu-name`
+# to ptxas):
+#
+#   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
+#     for sm_X runs on any sm_Y with Y >= X (onion model).
+#   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
+#     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
+#   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
+#
+# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
+# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
+# below: an unsupported combination simply has no entry.
+
+
 ## version range
 
 struct VersionRange
@@ -26,12 +36,12 @@ Base.intersect(v::VersionNumber, r::VersionRange) =
     v > r.upper ? (v:r.upper) : (v:v)
 
 
-## devices supported by the CUDA toolkit
+## devices supported by ptxas
 
 # Source:
 # - https://en.wikipedia.org/wiki/CUDA#GPUs_supported
 # - ptxas |& grep -A 10 '\--gpu-name'
-const cuda_cap_db = Dict(
+const ptxas_cap_db = Dict(
     v"1.0"   => between(lowest, v"6.5"),
     v"1.1"   => between(lowest, v"6.5"),
     v"1.2"   => between(lowest, v"6.5"),
@@ -63,9 +73,9 @@ const cuda_cap_db = Dict(
     v"12.1"  => between(v"12.9", highest),
 )
 
-function cuda_cap_support(ver::VersionNumber)
+function ptxas_cap_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_cap_db
+    for (cap,r) in ptxas_cap_db
         if ver in r
             push!(caps, cap)
         end
@@ -74,10 +84,10 @@ function cuda_cap_support(ver::VersionNumber)
 end
 
 
-## PTX ISAs supported by the CUDA toolkit
+## PTX ISAs supported by ptxas
 
 # Source: PTX ISA document, Release History table
-const cuda_ptx_db = Dict(
+const ptxas_ptx_db = Dict(
     v"1.0" => between(v"1.0", highest),
     v"1.1" => between(v"1.1", highest),
     v"1.2" => between(v"2.0", highest),
@@ -125,9 +135,9 @@ const cuda_ptx_db = Dict(
     v"9.2" => between(v"13.2", highest),
 )
 
-function cuda_ptx_support(ver::VersionNumber)
+function ptxas_ptx_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_ptx_db
+    for (cap,r) in ptxas_ptx_db
         if ver in r
             push!(caps, cap)
         end
@@ -138,52 +148,54 @@ end
 
 ## devices supported by each PTX ISA
 
-# Source: PTX ISA document, Release History table
-const ptx_cap_db = Dict(
-    v"1.0"   => between(v"1.0", highest),
-    v"1.1"   => between(v"1.0", highest),
-    v"1.2"   => between(v"1.2", highest),
-    v"1.3"   => between(v"1.2", highest),
-    v"2.0"   => between(v"2.0", highest),
-    v"3.0"   => between(v"3.1", highest),
-    v"3.2"   => between(v"4.0", highest),
-    v"3.5"   => between(v"3.1", highest),
-    v"3.7"   => between(v"4.1", highest),
-    v"5.0"   => between(v"4.0", highest),
-    v"5.2"   => between(v"4.1", highest),
-    v"5.3"   => between(v"4.2", highest),
-    v"6.0"   => between(v"5.0", highest),
-    v"6.1"   => between(v"5.0", highest),
-    v"6.2"   => between(v"5.0", highest),
-    v"7.0"   => between(v"6.0", highest),
-    v"7.2"   => between(v"6.1", highest),
-    v"7.5"   => between(v"6.3", highest),
-    v"8.0"   => between(v"7.0", highest),
-    v"8.6"   => between(v"7.1", highest),
-    v"8.7"   => between(v"7.4", highest),
-    v"8.9"   => between(v"7.8", highest),
-    v"9.0"   => between(v"7.8", highest),
-    #v"9.0a" => between(v"8.0", highest)
-    v"10.0"  => between(v"8.6", highest),
-    #v"10.0a"=> between(v"8.6", highest),
-    #v"10.0f"=> between(v"8.8", highest),
-    v"10.1"  => between(v"8.6", highest),
-    #v"10.1a"=> between(v"8.6", highest),
-    #v"10.1f"=> between(v"8.8", highest),
-    v"10.3"  => between(v"8.8", highest),
-    #v"10.3a"=> between(v"8.8", highest),
-    #v"10.3f"=> between(v"8.8", highest),
-    v"12.0"  => between(v"8.7", highest),
-    #v"12.0a"=> between(v"8.7", highest),
-    #v"12.0f"=> between(v"8.8", highest),
-    v"12.1"  => between(v"8.8", highest),
-    #v"12.1a"=> between(v"8.8", highest),
-    #v"12.1f"=> between(v"8.8", highest),
+# Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants
+# were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8.
+const ptx_sm_db = Dict{SMVersion, VersionRange}(
+    sm"10"   => between(v"1.0", highest),
+    sm"11"   => between(v"1.0", highest),
+    sm"12"   => between(v"1.2", highest),
+    sm"13"   => between(v"1.2", highest),
+    sm"20"   => between(v"2.0", highest),
+    sm"30"   => between(v"3.1", highest),
+    sm"32"   => between(v"4.0", highest),
+    sm"35"   => between(v"3.1", highest),
+    sm"37"   => between(v"4.1", highest),
+    sm"50"   => between(v"4.0", highest),
+    sm"52"   => between(v"4.1", highest),
+    sm"53"   => between(v"4.2", highest),
+    sm"60"   => between(v"5.0", highest),
+    sm"61"   => between(v"5.0", highest),
+    sm"62"   => between(v"5.0", highest),
+    sm"70"   => between(v"6.0", highest),
+    sm"72"   => between(v"6.1", highest),
+    sm"75"   => between(v"6.3", highest),
+    sm"80"   => between(v"7.0", highest),
+    sm"86"   => between(v"7.1", highest),
+    sm"87"   => between(v"7.4", highest),
+    sm"89"   => between(v"7.8", highest),
+    sm"90"   => between(v"7.8", highest),
+    sm"90a"  => between(v"8.0", highest),
+    sm"100"  => between(v"8.6", highest),
+    sm"100a" => between(v"8.6", highest),
+    sm"100f" => between(v"8.8", highest),
+    sm"101"  => between(v"8.6", highest),
+    sm"101a" => between(v"8.6", highest),
+    sm"101f" => between(v"8.8", highest),
+    sm"103"  => between(v"8.8", highest),
+    sm"103a" => between(v"8.8", highest),
+    sm"103f" => between(v"8.8", highest),
+    sm"120"  => between(v"8.7", highest),
+    sm"120a" => between(v"8.7", highest),
+    sm"120f" => between(v"8.8", highest),
+    sm"121"  => between(v"8.8", highest),
+    sm"121a" => between(v"8.8", highest),
+    sm"121f" => between(v"8.8", highest),
 )
 
-function ptx_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in ptx_cap_db
+# Set of `SMVersion`s (across all feature sets) whose minimum PTX ISA version is <= `ver`.
+function ptx_sm_support(ver::VersionNumber)
+    caps = Set{SMVersion}()
+    for (cap, r) in ptx_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -194,44 +206,52 @@ end
 
 ## devices supported by the LLVM NVPTX back-end
 
-# Source: LLVM/lib/Target/NVPTX/NVPTX.td
-const llvm_cap_db = Dict(
-    v"2.0"   => between(v"3.2", highest),
-    v"2.1"   => between(v"3.2", highest),
-    v"3.0"   => between(v"3.2", highest),
-    v"3.2"   => between(v"3.7", highest),
-    v"3.5"   => between(v"3.2", highest),
-    v"3.7"   => between(v"3.7", highest),
-    v"5.0"   => between(v"3.5", highest),
-    v"5.2"   => between(v"3.7", highest),
-    v"5.3"   => between(v"3.7", highest),
-    v"6.0"   => between(v"3.9", highest),
-    v"6.1"   => between(v"3.9", highest),
-    v"6.2"   => between(v"3.9", highest),
-    v"7.0"   => between(v"6", highest),
-    v"7.2"   => between(v"7", highest),
-    v"7.5"   => between(v"8", highest),
-    v"8.0"   => between(v"11", highest),
-    v"8.6"   => between(v"13", highest),
-    v"8.7"   => between(v"16", highest),
-    v"8.9"   => between(v"16", highest),
-    v"9.0"   => between(v"16", highest),
-    #v"9.0a" => between(v"18", highest),
-    v"10.0"  => between(v"20", highest),
-    #v"10.0a"=> between(v"20", highest),
-    v"10.1"  => between(v"20", highest),
-    #v"10.1a"=> between(v"20", highest),
-    v"10.3"  => between(v"21", highest),
-    #v"10.3a"=> between(v"21", highest),
-    v"12.0"  => between(v"20", highest),
-    #v"12.0a"=> between(v"20", highest),
-    v"12.1"  => between(v"21", highest),
-    #v"12.1a"=> between(v"21", highest),
+# Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up
+# here as a separate entry; without an entry LLVM does not know the variant CPU name and
+# constructing a TargetMachine with it would fall back to a generic subtarget.
+const llvm_sm_db = Dict{SMVersion, VersionRange}(
+    sm"20"   => between(v"3.2", highest),
+    sm"21"   => between(v"3.2", highest),
+    sm"30"   => between(v"3.2", highest),
+    sm"32"   => between(v"3.7", highest),
+    sm"35"   => between(v"3.2", highest),
+    sm"37"   => between(v"3.7", highest),
+    sm"50"   => between(v"3.5", highest),
+    sm"52"   => between(v"3.7", highest),
+    sm"53"   => between(v"3.7", highest),
+    sm"60"   => between(v"3.9", highest),
+    sm"61"   => between(v"3.9", highest),
+    sm"62"   => between(v"3.9", highest),
+    sm"70"   => between(v"6", highest),
+    sm"72"   => between(v"7", highest),
+    sm"75"   => between(v"8", highest),
+    sm"80"   => between(v"11", highest),
+    sm"86"   => between(v"13", highest),
+    sm"87"   => between(v"16", highest),
+    sm"89"   => between(v"16", highest),
+    sm"90"   => between(v"16", highest),
+    sm"90a"  => between(v"18", highest),
+    sm"100"  => between(v"20", highest),
+    sm"100a" => between(v"20", highest),
+    sm"100f" => between(v"21", highest),
+    sm"101"  => between(v"20", highest),
+    sm"101a" => between(v"20", highest),
+    sm"101f" => between(v"21", highest),
+    sm"103"  => between(v"21", highest),
+    sm"103a" => between(v"21", highest),
+    sm"103f" => between(v"21", highest),
+    sm"120"  => between(v"20", highest),
+    sm"120a" => between(v"20", highest),
+    sm"120f" => between(v"21", highest),
+    sm"121"  => between(v"21", highest),
+    sm"121a" => between(v"21", highest),
+    sm"121f" => between(v"21", highest),
 )
 
-function llvm_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in llvm_cap_db
+# Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`.
+function llvm_sm_support(ver::VersionNumber)
+    caps = Set{SMVersion}()
+    for (cap, r) in llvm_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -295,32 +315,14 @@ end
 function llvm_compat(version=LLVM.version())
     LLVM.InitializeNVPTXTarget()
 
-    cap_support = sort(collect(llvm_cap_support(version)))
-    ptx_support = sort(collect(llvm_ptx_support(version)))
-
-    return (cap=cap_support, ptx=ptx_support)
-end
-
-function cuda_compat(runtime=runtime_version(), compiler=compiler_version())
-    # we don't have to check the driver version, because it offers backwards compatbility
-    # beyond the CUDA toolkit version (e.g. R580 for CUDA 13 still supports Volta as
-    # deprecated in CUDA 13), and we don't have a reliable way to query the actual version
-    # as NVML isn't available on all platforms. let's instead simply assume that unsupported
-    # devices will not be exposed to the CUDA runtime and thus won't be visible to us.
-
-    # the compiler and runtime are versioned independently (and either can come from a
-    # local install), so we need to consider both:
-    # - device caps are dropped when either ptxas can't emit for them or the runtime
-    #   libraries drop them. take the intersection of both supported sets.
-    # - PTX ISA availability is a property of ptxas; the runtime doesn't care which ISA
-    #   compiled cubin came from.
-    cap_support = sort(collect(intersect(cuda_cap_support(runtime),
-                                         cuda_cap_support(compiler))))
-    ptx_support = sort(collect(cuda_ptx_support(compiler)))
-
-    return (cap=cap_support, ptx=ptx_support)
+    # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
+    # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
+    # support is per-CC -- the names track the value type.
+    return (sm=llvm_sm_support(version),
+            ptx=llvm_ptx_support(version))
 end
 
-function ptx_compat(ptx)
-    return (cap=ptx_cap_support(ptx),)
+function ptxas_compat(version=compiler_version())
+    return (cap=ptxas_cap_support(version),
+            ptx=ptxas_ptx_support(version))
 end
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 1b60e42602..bee91fbca1 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -1,12 +1,12 @@
 ## gpucompiler interface implementation
 
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
-    cap::VersionNumber
+    sm::SMVersion
     ptx::VersionNumber
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
-    h = hash(params.cap, h)
+    h = hash(params.sm, h)
     h = hash(params.ptx, h)
 
     return h
@@ -119,10 +119,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap)
+function rewrite_ptx_header(asm, ptx::VersionNumber, sm::SMVersion)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+        r"\.target sm_\d+\w*" => ".target $(cpu_name(sm))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -145,9 +145,16 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap
-        asm = rewrite_ptx_header(asm, ptx, cap)
+    # The rewrite stamps `.target`/`.version` with the *requested* (cuda-side) values.
+    # When the GPUCompiler-side target matches, LLVM already emits the right header
+    # (including the `a`/`f` suffix, via the CPU name); we only rewrite when they differ,
+    # e.g. when we had to clamp the target down for LLVM compatibility.
+    (; ptx, sm) = job.config.params
+    needs_rewrite = job.config.target.ptx != ptx ||
+                    job.config.target.cap != base_version(sm) ||
+                    job.config.target.feature_set !== sm.feature_set
+    if needs_rewrite
+        asm = rewrite_ptx_header(asm, ptx, sm)
     end
 
     return asm
@@ -179,66 +186,86 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
-    # determine the toolchain
+                                         arch=nothing, cap=nothing, ptx=nothing, kwargs...)
+    # `cap=` is the deprecated old name for `arch=` (matches nvcc/ptxas `-arch`).
+    if cap !== nothing
+        arch === nothing ||
+            throw(ArgumentError("pass either `arch=` or the deprecated `cap=`, not both"))
+        Base.depwarn("the `cap=` kwarg is deprecated; use `arch=` (matching nvcc/ptxas `-arch`) instead.",
+                     :cufunction)
+        arch = cap
+    end
+    # `SMVersion` is the universal normalizer: identity for an SMVersion, baseline-promotes
+    # a VersionNumber, parses a string. Anything else falls out as a MethodError naturally.
+    arch === nothing || (arch = SMVersion(arch))
+
+    # inspect the toolchain
     llvm_support = llvm_compat()
-    cuda_support = cuda_compat()
+    ptxas_support = ptxas_compat()
 
-    # determine the PTX ISA to use. we want at least 6.2, but will use newer if possible.
-    requested_ptx = something(ptx, v"6.2")
-    llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
-    cuda_ptxs = filter(>=(requested_ptx), cuda_support.ptx)
+    # determine the PTX ISA to use.
     if ptx !== nothing
-        # the user requested a specific PTX ISA
-        ## use the highest ISA supported by LLVM
-        isempty(llvm_ptxs) &&
+        # explicit request: take it exactly, validating against the toolchain
+        ptx in llvm_support.ptx ||
             error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
-        llvm_ptx = maximum(llvm_ptxs)
-        ## use the ISA as-is to invoke CUDA
-        cuda_ptx = ptx
+        ptx in ptxas_support.ptx ||
+            error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
+        llvm_ptx = ptxas_ptx = ptx
     else
-        # try to do the best thing (i.e., use the newest PTX ISA)
-        # XXX: is it safe to just use the latest PTX ISA? isn't it possible for, e.g.,
-        #      instructions to get deprecated?
+        # default: pick the newest PTX ISA supported by the toolchain (>=v6.2)
+        requested_ptx = v"6.2"
+        llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
+        ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
         isempty(llvm_ptxs) &&
             error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
-        llvm_ptx = maximum(llvm_ptxs)
-        isempty(cuda_ptxs) &&
-            error("CUDA.jl requires PTX $requested_ptx, which is not supported by CUDA $(compiler_version())")
-        cuda_ptx = maximum(cuda_ptxs)
+        isempty(ptxas_ptxs) &&
+            error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
+        ptxas_ptx = maximum(ptxas_ptxs)
+        llvm_ptx = min(maximum(llvm_ptxs), ptxas_ptx)
     end
 
-    # determine the compute capabilities to use. this should match the capability of the
-    # current device, but if LLVM doesn't support it, we can target an older capability
-    # and pass a different `-arch` to `ptxas`.
-    ptx_support = ptx_compat(cuda_ptx)
-    requested_cap = @something(cap, min(capability(dev), maximum(ptx_support.cap)))
-    llvm_caps = filter(<=(requested_cap), llvm_support.cap)
-    if cap !== nothing
-        ## use the highest capability supported by LLVM
-        isempty(llvm_caps) &&
-            error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())")
-        llvm_cap = maximum(llvm_caps)
-        ## use the capability as-is to invoke CUDA
-        cuda_cap = cap
+    # when selecting a compute capability, prefer the most recent one and, at equal
+    # capability, the most specific feature set (arch > family > baseline).
+    fs_rank(fs::Symbol) = fs === :arch ? 2 : fs === :family ? 1 : 0
+    sm_key(sm::SMVersion) = (base_version(sm), fs_rank(sm.feature_set))
+
+    # determine the compute capability to use.
+    ## ptxas
+    ptx_sms = ptx_sm_support(ptxas_ptx)
+    if arch !== nothing
+        # explicit request: take it as-is, validating against the PTX ISA
+        arch in ptx_sms ||
+            error("$(cpu_name(arch)) is not supported by PTX ISA $(ptxas_ptx)")
+        ptxas_sm = arch
     else
-        ## use the highest capability supported by LLVM
-        isempty(llvm_caps) &&
-            error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())")
-        llvm_cap = maximum(llvm_caps)
-        ## use the highest capability supported by CUDA
-        cuda_caps = filter(<=(capability(dev)), cuda_support.cap)
-        isempty(cuda_caps) &&
-            error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())")
-        cuda_cap = maximum(cuda_caps)
+        # pick the most specific capability the selected PTX ISA supports whose cubin
+        # would actually load on the current device. For baseline that's the onion model;
+        # `:arch` requires an exact CC match, `:family` a same-family match.
+        ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_sms)
+        isempty(ptxas_candidates) &&
+            error("Compute capability $(capability(dev)) is not supported by ptxas " *
+                  "$(compiler_version()) at PTX ISA $(ptxas_ptx)")
+        ptxas_sm = argmax(sm_key, ptxas_candidates)
+    end
+    ## LLVM
+    if ptxas_sm in llvm_support.sm
+        llvm_sm = ptxas_sm
+    else
+        # Exact `ptxas_sm` unavailable in LLVM. Fall back to the newest baseline LLVM
+        # target at the same or a lower base; arch/family features don't carry over.
+        baseline_candidates = filter(llvm_support.sm) do sm
+            sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
+        end
+        isempty(baseline_candidates) &&
+            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
+        llvm_sm = argmax(sm_key, baseline_candidates)
     end
-
-    # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
-    debuginfo = compiler_version() >= v"11.7"
 
     # create GPUCompiler objects
-    target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx)
+    target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx,
+                                 feature_set=llvm_sm.feature_set,
+                                 debuginfo=true, kwargs...)
+    params = CUDACompilerParams(; sm=ptxas_sm, ptx=ptxas_ptx)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -273,9 +300,9 @@ function compile(@nospecialize(job::CompilerJob))
         push!(ptxas_opts, "--compile-only")
     end
 
-    ptx = job.config.params.ptx
-    cap = job.config.params.cap
-    arch = "sm_$(cap.major)$(cap.minor)"
+    (; ptx, sm) = job.config.params
+    cap = base_version(sm)
+    arch = cpu_name(sm)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
@@ -288,7 +315,7 @@ function compile(@nospecialize(job::CompilerJob))
     end
     if param_usage > param_limit
         msg = """Kernel invocation uses too much parameter memory.
-                 $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by sm_$(cap.major)$(cap.minor) / PTX v$(ptx.major).$(ptx.minor)."""
+                 $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by $(arch) / PTX v$(ptx.major).$(ptx.minor)."""
 
         try
             details = "\n\nRelevant parameters:"
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 130d049e7c..bfb7d963ce 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :arch, :cap, :ptx]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -433,7 +433,12 @@ The following keyword arguments are supported:
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
-- `cap` and `ptx`: to override the compute capability and PTX version to compile for
+- `arch` and `ptx`: override the GPU architecture (matching nvcc/ptxas `-arch`) and the
+  PTX ISA version to compile for. `arch` accepts an [`SMVersion`](@ref) (e.g. via the
+  string macro: `arch=sm"103a"` for architecture-accelerated codegen on CC 10.3, or
+  `arch=sm"100f"` for family-portable Blackwell codegen), a string such as `"sm_103a"`,
+  or a `VersionNumber` (e.g. `arch=v"10.3"`, treated as baseline / forward-compatible).
+  The old kwarg name `cap=` is accepted as a deprecated alias.
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
new file mode 100644
index 0000000000..86f566766a
--- /dev/null
+++ b/CUDACore/src/compiler/sm.jl
@@ -0,0 +1,144 @@
+export SMVersion, @sm_str
+
+"""
+    SMVersion(major, minor, [feature_set])
+    SMVersion(s::AbstractString)
+    SMVersion(v::VersionNumber)
+    SMVersion(sm::SMVersion)
+
+A PTX compilation target, identifying a CUDA compute capability together with the
+subtarget feature set selected by the suffix on its `.target` directive. Printed and
+parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"`
+for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]`
+notation in the PTX ISA reference and to distinguish visually from a device-level
+`VersionNumber` like `v"9.0"`.
+
+The single-argument constructors normalize various inputs to an `SMVersion`:
+
+- `SMVersion(::AbstractString)` parses the compact form, with or without the `sm_`
+  prefix (so e.g. `SMVersion("sm_103a")` and `SMVersion("103a")` both work).
+- `SMVersion(::VersionNumber)` promotes a plain compute-capability version to a
+  baseline `SMVersion` (`SMVersion(v"10.3") == SMVersion(10, 3, :baseline)`).
+- `SMVersion(::SMVersion)` is the identity (idempotent).
+
+This is what lets `@cuda arch=...` accept `v"10.3"`, `sm"103a"`, `"sm_103a"`, or
+an already-constructed `SMVersion` interchangeably.
+
+`feature_set` is one of:
+
+- `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"):
+  PTX compiled for `sm_X` runs on any `sm_Y` with `Y >= X`.
+- `:family` (`f` suffix, e.g. `sm_100f`) — same-major-family-portable: PTX runs on
+  any device in the same architecture family (currently == same major version) at
+  or above this CC.
+- `:arch` (`a` suffix, e.g. `sm_90a`) — locked to one exact CC: PTX runs only on
+  devices with exactly this compute capability, but in exchange gets access to
+  architecture-accelerated features.
+
+See NVIDIA's PTX ISA reference under `.target` for the full compatibility rules,
+and `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature
+definitions.
+
+Public fields:
+- `sm.major::Int`
+- `sm.minor::Int`
+- `sm.feature_set::Symbol`
+
+See also [`@sm_str`](@ref) for an ergonomic string-macro constructor.
+
+# Examples
+```julia
+julia> SMVersion(9, 0)            # baseline
+sm"90"
+
+julia> SMVersion(9, 0, :arch)
+sm"90a"
+
+julia> sm"100f" == SMVersion(10, 0, :family)
+true
+```
+"""
+struct SMVersion
+    major::Int
+    minor::Int
+    feature_set::Symbol
+    # Inner constructor: refuse unknown feature sets, then store major/minor as `Int`.
+    function SMVersion(major::Integer, minor::Integer, feature_set::Symbol = :baseline)
+        !(feature_set in (:baseline, :family, :arch)) &&
+            error("SMVersion feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
+        return new(Int(major), Int(minor), feature_set)
+    end
+end
+
+function Base.parse(::Type{SMVersion}, s::AbstractString)
+    # Accepts NVIDIA's compact `NN[a|f]` spelling, optionally prefixed with `sm_` so
+    # strings copied from tool output or configuration can be used unchanged. The last
+    # digit is the minor version and every digit before it belongs to the major version,
+    # i.e. the minor is always a single digit. A trailing `a`/`f` picks the feature set.
+    spec = startswith(s, "sm_") ? SubString(s, 4) : s
+    found = match(r"^(\d+)(\d)([af]?)$", spec)
+    if found === nothing
+        error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"sm_103a\", or \"100f\"")
+    end
+    major_str, minor_str, tag = found.captures
+    # no suffix letter means the baseline feature set
+    feature_set = tag == "a" ? :arch : tag == "f" ? :family : :baseline
+    return SMVersion(parse(Int, major_str), parse(Int, minor_str), feature_set)
+end
+
+# One-argument constructors normalize any accepted `arch`/`cap`-style value: an existing
+# `SMVersion` is returned unchanged, a `VersionNumber` becomes the baseline target for
+# that major/minor, and a string (with or without the `sm_` prefix) is parsed.
+SMVersion(target::SMVersion) = target
+SMVersion(cap::VersionNumber) = SMVersion(cap.major, cap.minor)
+SMVersion(str::AbstractString) = Base.parse(SMVersion, str)
+
+# Letter appended to the target name for the feature set: "a", "f", or "" (baseline).
+suffix(sm::SMVersion) =
+    sm.feature_set === :family ? "f" : (sm.feature_set === :arch ? "a" : "")
+
+# Name of the target as LLVM (CPU name) and PTX (`.target`) spell it, e.g. "sm_103a".
+cpu_name(sm::SMVersion) = string("sm_", sm.major, sm.minor, suffix(sm))
+
+# The plain compute capability as a `VersionNumber`, i.e. the target without its
+# feature set; usable against the version-keyed compatibility databases.
+base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor)
+
+# Can code compiled for target `sm` run on a device of compute capability `dev_cap`?
+#   - :arch     -- only if the device CC equals the target CC exactly.
+#   - :family   -- only within the same major version, at or above the target CC.
+#   - :baseline -- on any device at or above the target CC (onion model).
+# NOTE(review): "family == same major" is an approximation of NVIDIA's family rules;
+# confirm against the `.target` compatibility table in the PTX ISA reference.
+function runs_on(sm::SMVersion, dev_cap::VersionNumber)
+    target_cap = base_version(sm)
+    kind = sm.feature_set
+    kind === :arch && return target_cap == dev_cap
+    # both remaining feature sets need the device at or above the target CC;
+    # :family additionally pins the major version
+    same_family = kind !== :family || sm.major == dev_cap.major
+    return same_family && target_cap <= dev_cap
+end
+
+
+Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix(sm), '"')
+
+"""
+    @sm_str
+
+Build an [`SMVersion`](@ref) from a string literal written in NVIDIA's compact
+`sm_NN[a|f]` notation; the `sm_` prefix is optional. No suffix means baseline
+(`sm"90"`), `a` means architecture-accelerated (`sm"90a"`), and `f` means
+family-specific (`sm"100f"`). The literal is handed to `SMVersion(str)` while the
+macro is expanded, so the expansion is the resulting `SMVersion` value itself.
+
+# Examples
+```julia
+julia> sm"103a"
+sm"103a"
+
+julia> sm"100f" == SMVersion(10, 0, :family)
+true
+```
+"""
+macro sm_str(str); return SMVersion(str); end
diff --git a/CUDACore/src/device/intrinsics/version.jl b/CUDACore/src/device/intrinsics/version.jl
index 6114833eeb..ea66448bda 100644
--- a/CUDACore/src/device/intrinsics/version.jl
+++ b/CUDACore/src/device/intrinsics/version.jl
@@ -1,8 +1,8 @@
 # device intrinsics for querying the compute SimpleVersion and PTX ISA version
 
-export compute_capability, ptx_isa_version
+export compute_capability, ptx_isa_version, target_feature_set
 
-for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"]
+for var in ["sm_major", "sm_minor", "sm_features", "ptx_major", "ptx_minor"]
     @eval @device_function @inline $(Symbol(var))() =
         Base.llvmcall(
             $("""@$var = external global i32
@@ -17,3 +17,16 @@ end
 @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor())
 @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor())
 
+# Report which feature set the `.target` directive encodes: `:baseline`, `:family` or
+# `:arch`. (NVIDIA's PTX ISA reference: ".target specifies the set of features in the
+# target architecture for which the current PTX code was generated.") GPUCompiler
+# provides the value through the `sm_features` LLVM global, encoded as a
+# `GPUCompiler.TargetFeatureSet`. Once LLVM has inlined that constant the comparisons
+# fold away, so `if target_feature_set() === :arch ... end` leaves a single branch in PTX.
+@device_function @inline function target_feature_set()
+    encoded = sm_features()
+    encoded == UInt32(GPUCompiler.ArchFeatures) && return :arch
+    encoded == UInt32(GPUCompiler.FamilyFeatures) && return :family
+    return :baseline
+end
+
diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl
index cd3687f0ea..6739d27d68 100644
--- a/CUDACore/src/device/runtime.jl
+++ b/CUDACore/src/device/runtime.jl
@@ -12,14 +12,16 @@ function precompile_runtime()
     f = ()->return
     mi = methodinstance(typeof(f), Tuple{})
 
-    caps = llvm_compat().cap
+    # `llvm_compat().sm` holds `SMVersion`s, feature-set variants included; runtime caches
+    # are feature_set-agnostic, so we only warm the baseline entries.
+    sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().sm)
     ptx = maximum(llvm_compat().ptx)
     JuliaContext() do ctx
-        for cap in caps, debuginfo in [false, true]
+        for sm in sms, debuginfo in [false, true]
             # NOTE: this often runs when we don't have a functioning set-up,
             #       so we don't use `compiler_config` which requires NVML
-            target = PTXCompilerTarget(; cap, ptx, debuginfo)
-            params = CUDACompilerParams(; cap, ptx)
+            target = PTXCompilerTarget(; cap=base_version(sm), ptx, debuginfo)
+            params = CUDACompilerParams(; sm, ptx)
             config = CompilerConfig(target, params)
             job = CompilerJob(mi, config)
             GPUCompiler.load_runtime(job)
diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl
index 58ae94f536..8817aa74cc 100644
--- a/CUDACore/src/precompile.jl
+++ b/CUDACore/src/precompile.jl
@@ -13,11 +13,16 @@ if :NVPTX in LLVM.backends()
             end
 
             llvm_support = llvm_compat()
-            llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
+            # `.sm` is `Set{SMVersion}` (with variants); pick the highest baseline
+            # entry <= v"7.5" for a portable precompile artifact.
+            llvm_sm = argmax(base_version,
+                             filter(sm -> sm.feature_set === :baseline &&
+                                          base_version(sm) <= v"7.5",
+                                    llvm_support.sm))
             llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))
 
-            target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
-            params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
+            target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true)
+            params = CUDACompilerParams(; sm=llvm_sm, ptx=llvm_ptx)
             config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)
 
             tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
diff --git a/CUDATools/src/utilities.jl b/CUDATools/src/utilities.jl
index 079ad71eb3..c14453cb7a 100644
--- a/CUDATools/src/utilities.jl
+++ b/CUDATools/src/utilities.jl
@@ -142,5 +142,27 @@ function versioninfo(io::IO=stdout)
             query_cuda()
         end
         println(io, "  $(i-1): $str (sm_$(cap.major)$(cap.minor), $(Base.format_bytes(mem.free)) / $(Base.format_bytes(mem.total)) available)")
+
+        # report the default compilation target we'd select for this device
+        config = try
+            CUDACore.compiler_config(dev)
+        catch
+            nothing
+        end
+        if config !== nothing
+            ptxas_sm  = config.params.sm
+            ptxas_ptx = config.params.ptx
+            llvm_sm   = CUDACore.SMVersion(config.target.cap.major,
+                                           config.target.cap.minor,
+                                           config.target.feature_set)
+            llvm_ptx  = config.target.ptx
+            ptxas_str = "$(CUDACore.cpu_name(ptxas_sm)) / PTX $(ptxas_ptx.major).$(ptxas_ptx.minor)"
+            if llvm_sm == ptxas_sm && llvm_ptx == ptxas_ptx
+                println(io, "     compiles to $ptxas_str")
+            else
+                llvm_str = "$(CUDACore.cpu_name(llvm_sm)) / PTX $(llvm_ptx.major).$(llvm_ptx.minor)"
+                println(io, "     compiles to $ptxas_str (LLVM: $llvm_str)")
+            end
+        end
     end
 end
diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md
index a7ce178a3e..a2c493d789 100644
--- a/docs/src/api/compiler.md
+++ b/docs/src/api/compiler.md
@@ -25,6 +25,14 @@ registers
 memory
 ```
 
+The PTX compilation target is identified by an `SMVersion`, constructed via the
+`sm"..."` string macro:
+
+```@docs
+SMVersion
+@sm_str
+```
+
 To plug in alternative compiler back-ends (e.g. cuTile.jl), `@cuda` dispatches
 through a small protocol:
 
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index 6049552626..e347cf8300 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -232,11 +232,33 @@ function main()
               - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory,
                 $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
                 $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
+
+    # Run once to validate: the result must be finite and the plain sum of all
+    # entries must match a baked-in reference computed from this same StableRNG(123)
+    # seed. cuTile/perf/volumerhs.jl uses the same reference so the two
+    # implementations can be cross-checked.
+    CUDA.@sync kernel(rhs, Q, vgeo, DFloat(grav), D, nelem;
+                       threads=threads, blocks=nelem)
+    rhs_h = Array(rhs)
+    @assert all(isfinite, rhs_h) "kernel produced non-finite values"
+    rsum = sum(rhs_h)
+    ref  = 1.4227473f10
+    rel  = abs(rsum - ref) / abs(ref)
+    @assert rel < 1f-3 "rhs checksum off by $rel (got $rsum, expected $ref)"
+    @info "validation passed" rhs_sum=rsum reference=ref rel_err=rel
+    fill!(rhs, 0)
+
     results = @benchmark begin
+        # zero rhs each iteration so accumulation stays meaningful
+        fill!($rhs, 0)
         CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
                                          threads=$threads, blocks=$nelem)
     end
 
+    bytes = nelem * 28 * Nq^3 * sizeof(DFloat)
+    bw    = bytes / (minimum(results).time / 1e9) / 1e9
+    @info "SIMT volumerhs! benchmark" min_ms=minimum(results).time/1e6 median_ms=median(results).time/1e6 effective_BW="$(round(Int, bw)) GB/s"
+
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
     CUDA.unsafe_free!(rhs)
     CUDA.unsafe_free!(Q)
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 02f275aa7d..db431db4ed 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -255,10 +255,51 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0")
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90")
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
+
+    # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90a")
+    @test occursin(".target sm_90a", asm_arch)
+    @test success(run_ptxas(asm_arch, "sm_90a"))
+
+    # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"100f")
+    @test occursin(".target sm_100f", asm_family)
+    @test success(run_ptxas(asm_family, "sm_100f"))
+end
+
+@testset "SMVersion and sm\"...\" macro" begin
+    @test sm"90"   == SMVersion(9, 0, :baseline)
+    @test sm"90a"  == SMVersion(9, 0, :arch)
+    @test sm"100f" == SMVersion(10, 0, :family)
+    # printing roundtrips via the macro form
+    @test sprint(show, sm"103a") == "sm\"103a\""
+    @test sprint(show, sm"100")  == "sm\"100\""
+    # cpu_name reflects feature_set
+    @test CUDACore.cpu_name(sm"90")   == "sm_90"
+    @test CUDACore.cpu_name(sm"90a")  == "sm_90a"
+    @test CUDACore.cpu_name(sm"100f") == "sm_100f"
+    # base_version drops the suffix back to a comparable VersionNumber
+    @test CUDACore.base_version(sm"103a") == v"10.3"
+    # constructor rejects bogus feature_set
+    @test_throws ErrorException SMVersion(9, 0, :bogus)
+    # the parser rejects malformed strings
+    @test_throws ErrorException parse(SMVersion, "10.3a")    # dotted form (NVIDIA uses dotless)
+    @test_throws ErrorException parse(SMVersion, "100x")     # unknown suffix
+    @test_throws ErrorException parse(SMVersion, "1")        # only one digit (need at least major + minor)
+    @test_throws ErrorException parse(SMVersion, "")         # empty
+
+    # `SMVersion(x)` as the universal normalizer:
+    @test SMVersion(sm"103a")          === sm"103a"                        # identity
+    @test SMVersion(v"10.3")           == SMVersion(10, 3, :baseline)      # VersionNumber → baseline
+    @test SMVersion("103a")            == sm"103a"                         # bare string
+    @test SMVersion("sm_103a")         == sm"103a"                         # accepts NVIDIA prefix
+    # the macro is just a parse-time call to the constructor
+    @test sm"103a"                     == SMVersion("103a")
 end
 
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index edf589913d..387af3547e 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -50,17 +50,84 @@ end
     @cuda threads=2 dummy()
 
     # sm_10 isn't supported by LLVM
-    @test_throws "not supported by LLVM" @cuda launch=false cap=v"1.0" dummy()
+    @test_throws "not supported by LLVM" @cuda launch=false arch=sm"10" dummy()
     # sm_20 is, but not by any CUDA version we support
-    @test_throws "Failed to compile PTX code" @cuda launch=false cap=v"2.0" dummy()
+    @test_throws "Failed to compile PTX code" @cuda launch=false arch=sm"20" dummy()
     # there isn't any capability other than the device's that's guaruanteed to work
-    @cuda launch=false cap=capability(device()) dummy()
+    dev_cap = capability(device())
+    dev_sm = SMVersion(dev_cap.major, dev_cap.minor)
+    @cuda launch=false arch=dev_sm dummy()
+    # `arch=` also accepts a plain `VersionNumber` -- treated as baseline. Equivalent
+    # to constructing the SMVersion directly.
+    @cuda launch=false arch=dev_cap dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0"))
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50"))
+    @test contains(asm, ".target sm_50")
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0"))
     @test contains(asm, ".target sm_50")
 
+    # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the
+    # `.version` line should match what was asked for, independently of what LLVM and
+    # ptxas would natively pick.
     asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3"))
     @test contains(asm, ".version 6.3")
+
+    # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it
+    # used to be); a clearly out-of-range value must error at config time.
+    @test_throws "not supported" @cuda launch=false ptx=v"99.0" dummy()
+
+    # feature_set is selected by the suffix on the sm"..." string; the suffix should
+    # surface in the .target directive in the PTX output. The cuda-side `.target` is
+    # the variant regardless of LLVM support -- the mcgen rewrite stamps it in even
+    # when LLVM clamped to baseline for codegen.
+    sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch)
+    sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
+
+    if dev_cap >= v"9.0"
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        # arch-specific cubin should also actually launch on the matching device
+        @cuda arch=sm_a dummy()
+    end
+    if dev_cap >= v"10.0"
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
+        @cuda arch=sm_f dummy()
+    end
+
+    # `cap=` is the deprecated alias for `arch=`; check that the depwarn fires. (Only the
+    # warning is asserted here; the PTX itself is covered by the `arch=` checks above.)
+    @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50"))
+
+    # With no explicit `arch=`, we default to architecture-specific code paths on CC >=9.0
+    # since we know the exact device. The cuda-side `.target` is the variant regardless of
+    # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs.
+    if dev_cap >= v"9.0"
+        asm = sprint(io->CUDA.code_ptx(io, dummy, ()))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+    end
+
+    # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built
+    # for (not the cuda-side .target): when LLVM doesn't natively support the exact variant,
+    # we fall back to baseline LLVM, so the global reflects baseline. The if-chain folds at
+    # codegen time, so the launched kernel writes a single constant.
+    function read_feature_set!(out)
+        @inbounds out[1] = if target_feature_set() === :arch
+            UInt32(2)
+        elseif target_feature_set() === :family
+            UInt32(1)
+        else
+            UInt32(0)
+        end
+        return
+    end
+    out = CuArray{UInt32}([typemax(UInt32)])
+    @cuda threads=1 read_feature_set!(out)
+    # arch features come through `target_feature_set()` only when LLVM natively supported
+    # the variant; otherwise we fell back to baseline LLVM and the global reflects that.
+    arch_in_llvm = sm_a in CUDACore.llvm_sm_support(CUDACore.LLVM.version())
+    expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0)
+    @test Array(out)[1] == expected
 end