From 8d65f63b0301db1fd398e593261fad401bf62123 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 09:54:32 +0000
Subject: [PATCH 01/16] Add architecture- and family-specific PTX target
 support

---
 CUDACore/src/compatibility.jl        | 56 +++++++++++++++++-----------
 CUDACore/src/compiler/compilation.jl | 38 +++++++++++++++----
 test/core/codegen.jl                 | 32 ++++++++++++++++
 3 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index afb833b32a..c4cbf177a3 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -1,14 +1,43 @@
 # compatibility of Julia, CUDA and LLVM
 
-# NOTE: Target architectures with suffix “a”, such as sm_90a, include
-# architecture-accelerated features that are supported on the specified architecture only,
-# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
-# such targets cannot be run on later generation devices. Architecture-accelerated features
-# can only be used with targets that support these features.
+# PTX compilation targets come in three feature-set flavors, selected via the suffix on the
+# `.target` directive (and the matching `--gpu-name` to ptxas):
+#
+#   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
+#     for sm_X runs on any sm_Y with Y >= X (onion model).
+#   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
+#     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
+#     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
+#   - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one
+#     exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with
+#     CC 9.0; uses the same PTX requirement as the plain target.
+#
+# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma,
+# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across
+# devices in the same major family; baseline unlocks only the forward-portable set.
 
 const lowest = v"0"
 const highest = v"999"
 
+# PTX compilation target feature set; see top-of-file note for the hierarchy and rules.
+@enum PTXTargetKind Baseline Family Architectural
+
+# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the
+# kind PTX floors are uniform across caps, so we encode them here rather than in the
+# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax
+# was introduced in PTX 8.0; the `f` syntax in PTX 8.8.
+function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind)
+    if kind === Architectural
+        cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap")
+        ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx")
+    end
+    if kind === Family
+        cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap")
+        ptx >= v"8.8"  || error("Family targets require PTX ISA >= 8.8; got $ptx")
+    end
+    return
+end
+
 
 ## version range
 
@@ -163,22 +192,11 @@ const ptx_cap_db = Dict(
     v"8.7"   => between(v"7.4", highest),
     v"8.9"   => between(v"7.8", highest),
     v"9.0"   => between(v"7.8", highest),
-    #v"9.0a" => between(v"8.0", highest)
     v"10.0"  => between(v"8.6", highest),
-    #v"10.0a"=> between(v"8.6", highest),
-    #v"10.0f"=> between(v"8.8", highest),
     v"10.1"  => between(v"8.6", highest),
-    #v"10.1a"=> between(v"8.6", highest),
-    #v"10.1f"=> between(v"8.8", highest),
     v"10.3"  => between(v"8.8", highest),
-    #v"10.3a"=> between(v"8.8", highest),
-    #v"10.3f"=> between(v"8.8", highest),
     v"12.0"  => between(v"8.7", highest),
-    #v"12.0a"=> between(v"8.7", highest),
-    #v"12.0f"=> between(v"8.8", highest),
     v"12.1"  => between(v"8.8", highest),
-    #v"12.1a"=> between(v"8.8", highest),
-    #v"12.1f"=> between(v"8.8", highest),
 )
 
 function ptx_cap_support(ver::VersionNumber)
@@ -216,17 +234,11 @@ const llvm_cap_db = Dict(
     v"8.7"   => between(v"16", highest),
     v"8.9"   => between(v"16", highest),
     v"9.0"   => between(v"16", highest),
-    #v"9.0a" => between(v"18", highest),
     v"10.0"  => between(v"20", highest),
-    #v"10.0a"=> between(v"20", highest),
     v"10.1"  => between(v"20", highest),
-    #v"10.1a"=> between(v"20", highest),
     v"10.3"  => between(v"21", highest),
-    #v"10.3a"=> between(v"21", highest),
     v"12.0"  => between(v"20", highest),
-    #v"12.0a"=> between(v"20", highest),
     v"12.1"  => between(v"21", highest),
-    #v"12.1a"=> between(v"21", highest),
 )
 
 function llvm_cap_support(ver::VersionNumber)
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 1b60e42602..b3a5e2fc30 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -3,15 +3,27 @@
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
     cap::VersionNumber
     ptx::VersionNumber
+    kind::PTXTargetKind = Baseline
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
     h = hash(params.cap, h)
     h = hash(params.ptx, h)
+    h = hash(params.kind, h)
 
     return h
 end
 
+# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural`
+# (ptxas requires exact match) and need to be in the same major family for `kind=Family`;
+# emitting the same string on both sides handles all three kinds correctly.
+function format_target(cap::VersionNumber, kind::PTXTargetKind)
+    suffix = kind === Architectural ? "a" :
+             kind === Family        ? "f" : ""
+    return "sm_$(cap.major)$(cap.minor)$suffix"
+end
+
 const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams}
 const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams}
 
@@ -119,10 +131,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap)
+function rewrite_ptx_header(asm, ptx, cap, kind=Baseline)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+        r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -145,9 +157,9 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap
-        asm = rewrite_ptx_header(asm, ptx, cap)
+    (; ptx, cap, kind) = job.config.params
+    if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline
+        asm = rewrite_ptx_header(asm, ptx, cap, kind)
     end
 
     return asm
@@ -179,7 +191,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, kind=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -236,9 +248,18 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = compiler_version() >= v"11.7"
 
+    # default the target feature set based on the device cap. Architectural is the
+    # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
+    # superset of Baseline, and the cubin is per-device anyway so portability isn't on
+    # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
+    if kind === nothing
+        kind = cuda_cap >= v"9.0" ? Architectural : Baseline
+    end
+    validate_target_kind(cuda_cap, cuda_ptx, kind)
+
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx)
+    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -275,7 +296,8 @@ function compile(@nospecialize(job::CompilerJob))
 
     ptx = job.config.params.ptx
     cap = job.config.params.cap
-    arch = "sm_$(cap.major)$(cap.minor)"
+    kind = job.config.params.kind
+    arch = format_target(cap, kind)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 02f275aa7d..f813197b46 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -259,6 +259,38 @@ end
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
+
+    # Architectural kind appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match
+    # for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural)
+    @test occursin(".target sm_90a", asm_arch)
+    @test success(run_ptxas(asm_arch, "sm_90a"))
+
+    # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family)
+    @test occursin(".target sm_100f", asm_family)
+    @test success(run_ptxas(asm_family, "sm_100f"))
+end
+
+@testset "CUDACompilerParams hash discriminates on kind" begin
+    # Without `kind` in the hash, two params differing only on kind would collide in
+    # the compiler cache and silently return a cubin compiled for the wrong feature set.
+    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural)
+    @test hash(base) != hash(arch)
+    @test base != arch
+end
+
+@testset "validate_target_kind" begin
+    # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8.
+    @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family)
+    @test CUDACore.validate_target_kind(v"9.0",  v"8.0", CUDACore.Architectural) === nothing
+    @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing
+    @test CUDACore.validate_target_kind(v"5.0",  v"6.2", CUDACore.Baseline) === nothing
 end
 
 end

From 9f254a296d29b505fdd6fe71cd84247ef44e2524 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 10:24:39 +0000
Subject: [PATCH 02/16] Remove `kind` compiler config kwarg

---
 CUDACore/src/compiler/compilation.jl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index b3a5e2fc30..cd6e9d4b02 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -191,7 +191,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kind=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -248,14 +248,11 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = compiler_version() >= v"11.7"
 
-    # default the target feature set based on the device cap. Architectural is the
+    # pick the target feature set based on the device cap. Architectural is the
     # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
     # superset of Baseline, and the cubin is per-device anyway so portability isn't on
     # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
-    if kind === nothing
-        kind = cuda_cap >= v"9.0" ? Architectural : Baseline
-    end
-    validate_target_kind(cuda_cap, cuda_ptx, kind)
+    kind = cuda_cap >= v"9.0" ? Architectural : Baseline
 
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)

From 440708d515ede17ae77d7a2cf7f5d51f750b6259 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 19:28:09 +0000
Subject: [PATCH 03/16] Add `feature_set` compiler config kwarg; use symbols
 instead of enum type

---
 CUDACore/src/compatibility.jl        | 38 +++++++-----------
 CUDACore/src/compiler/compilation.jl | 58 ++++++++++++++++------------
 CUDACore/src/compiler/execution.jl   |  4 +-
 test/core/codegen.jl                 | 42 ++++++++++----------
 4 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index c4cbf177a3..4e58578691 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -1,5 +1,9 @@
 # compatibility of Julia, CUDA and LLVM
 
+const lowest = v"0"
+const highest = v"999"
+
+
 # PTX compilation targets come in three feature-set flavors, selected via the suffix on the
 # `.target` directive (and the matching `--gpu-name` to ptxas):
 #
@@ -8,32 +12,20 @@
 #   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
 #     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
 #     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
-#   - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one
-#     exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with
+#   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with
 #     CC 9.0; uses the same PTX requirement as the plain target.
 #
-# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma,
-# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across
-# devices in the same major family; baseline unlocks only the forward-portable set.
-
-const lowest = v"0"
-const highest = v"999"
-
-# PTX compilation target feature set; see top-of-file note for the hierarchy and rules.
-@enum PTXTargetKind Baseline Family Architectural
-
-# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the
-# kind PTX floors are uniform across caps, so we encode them here rather than in the
-# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax
-# was introduced in PTX 8.0; the `f` syntax in PTX 8.8.
-function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind)
-    if kind === Architectural
-        cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap")
-        ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx")
+function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
+    if !(feature_set in (:baseline, :family, :architecture))
+        error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))")
     end
-    if kind === Family
-        cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap")
-        ptx >= v"8.8"  || error("Family targets require PTX ISA >= 8.8; got $ptx")
+    if feature_set === :architecture
+        cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
+        ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
+    elseif feature_set === :family
+        cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap")
+        ptx >= v"8.8"  || error("Family-specific targets require PTX ISA >= 8.8; got $ptx")
     end
     return
 end
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index cd6e9d4b02..2e21e3b603 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -3,24 +3,24 @@
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
     cap::VersionNumber
     ptx::VersionNumber
-    kind::PTXTargetKind = Baseline
+    feature_set::Symbol = :baseline
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
     h = hash(params.cap, h)
     h = hash(params.ptx, h)
-    h = hash(params.kind, h)
+    h = hash(params.feature_set, h)
 
     return h
 end
 
-# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
-# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural`
-# (ptxas requires exact match) and need to be in the same major family for `kind=Family`;
-# emitting the same string on both sides handles all three kinds correctly.
-function format_target(cap::VersionNumber, kind::PTXTargetKind)
-    suffix = kind === Architectural ? "a" :
-             kind === Family        ? "f" : ""
+# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture`
+# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
+# emitting the same string on both sides handles all three feature sets correctly.
+function format_target(cap::VersionNumber, feature_set::Symbol)
+    suffix = feature_set === :architecture ? "a" :
+             feature_set === :family       ? "f" : ""
     return "sm_$(cap.major)$(cap.minor)$suffix"
 end
 
@@ -131,10 +131,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap, kind=Baseline)
+function rewrite_ptx_header(asm, ptx, cap, feature_set)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))")
+        r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -157,9 +157,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap, kind) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline
-        asm = rewrite_ptx_header(asm, ptx, cap, kind)
+    (; ptx, cap, feature_set) = job.config.params
+    needs_rewrite = job.config.target.ptx != ptx ||
+                    job.config.target.cap != cap ||
+                    feature_set !== :baseline
+    if needs_rewrite
+        asm = rewrite_ptx_header(asm, ptx, cap, feature_set)
     end
 
     return asm
@@ -191,7 +194,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, feature_set=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -248,15 +251,22 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = compiler_version() >= v"11.7"
 
-    # pick the target feature set based on the device cap. Architectural is the
-    # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
-    # superset of Baseline, and the cubin is per-device anyway so portability isn't on
-    # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
-    kind = cuda_cap >= v"9.0" ? Architectural : Baseline
+    # Pick the target feature set based on the device cap.
+    # Architecture-specific is chosen for devices where it's
+    # available (CC >= 9.0) since it's a strict superset of
+    # the baseline and family feature sets.
+    if feature_set === nothing
+        feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0"
+            :architecture
+        else
+            :baseline
+        end
+    end
+    validate_feature_set(cuda_cap, cuda_ptx, feature_set)
 
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind)
+    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -291,10 +301,8 @@ function compile(@nospecialize(job::CompilerJob))
         push!(ptxas_opts, "--compile-only")
     end
 
-    ptx = job.config.params.ptx
-    cap = job.config.params.cap
-    kind = job.config.params.kind
-    arch = format_target(cap, kind)
+    (; ptx, cap, feature_set) = job.config.params
+    arch = format_target(cap, feature_set)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 130d049e7c..7cfe96e1a9 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -434,6 +434,8 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
+- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the
+  most specific supported by the device
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index f813197b46..48af0d92f5 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -255,42 +255,42 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0")
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline)
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
 
-    # Architectural kind appends an `a` suffix to the .target directive (and the same
-    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match
-    # for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural)
+    # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture)
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
-    # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line.
-    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family)
+    # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family)
     @test occursin(".target sm_100f", asm_family)
     @test success(run_ptxas(asm_family, "sm_100f"))
 end
 
-@testset "CUDACompilerParams hash discriminates on kind" begin
-    # Without `kind` in the hash, two params differing only on kind would collide in
-    # the compiler cache and silently return a cubin compiled for the wrong feature set.
-    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline)
-    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural)
+@testset "CUDACompilerParams hash discriminates on feature_set" begin
+    # Without feature_set in the hash, two params differing only on feature_set would collide
+    # in the compiler cache and silently return a cubin compiled for the wrong feature set.
+    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture)
     @test hash(base) != hash(arch)
     @test base != arch
 end
 
-@testset "validate_target_kind" begin
-    # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8.
-    @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family)
-    @test CUDACore.validate_target_kind(v"9.0",  v"8.0", CUDACore.Architectural) === nothing
-    @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing
-    @test CUDACore.validate_target_kind(v"5.0",  v"6.2", CUDACore.Baseline) === nothing
+@testset "validate_feature_set" begin
+    # Architecture-specific needs CC >= 9.0 and PTX >= 8.0
+    # Family-specific needs CC >= 10.0 and PTX >= 8.8.
+    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
+    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :architecture) === nothing
+    @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
+    @test CUDACore.validate_feature_set(v"5.0",  v"6.2", :baseline) === nothing
 end
 
 end

From 74e8ddd24d5ec7291b3d6d0556f6d4b338e4849b Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 19:59:39 +0000
Subject: [PATCH 04/16] Require opt-in through `feature_set` to avoid breaking
 changes

---
 CUDACore/src/compiler/compilation.jl | 14 +++-----------
 CUDACore/src/compiler/execution.jl   |  3 +--
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 2e21e3b603..6b859957eb 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -251,17 +251,9 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = compiler_version() >= v"11.7"
 
-    # Pick the target feature set based on the device cap.
-    # Architecture-specific is chosen for devices where it's
-    # available (CC >= 9.0) since it's a strict superset of
-    # the baseline and family feature sets.
-    if feature_set === nothing
-        feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0"
-            :architecture
-        else
-            :baseline
-        end
-    end
+    # Conservatively pick baseline for backward compatibility,
+    # requiring explicit opt-in for family- and architecture-specific instructions. 
+    feature_set = something(feature_set, :baseline)
     validate_feature_set(cuda_cap, cuda_ptx, feature_set)
 
     # create GPUCompiler objects
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 7cfe96e1a9..e5359e50fc 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -434,8 +434,7 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
-- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the
-  most specific supported by the device
+- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture`
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when

From 1faeb7eb6f3b77cd310452b50b73b9ebbba27ebe Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Fri, 1 May 2026 15:41:37 +0000
Subject: [PATCH 05/16] Shorten `:architecture` to `:arch`

---
 CUDACore/src/compatibility.jl        |  6 +++---
 CUDACore/src/compiler/compilation.jl |  4 ++--
 CUDACore/src/compiler/execution.jl   |  2 +-
 test/core/codegen.jl                 | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index 4e58578691..08506a37af 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -17,10 +17,10 @@ const highest = v"999"
 #     CC 9.0; uses the same PTX requirement as the plain target.
 #
 function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
-    if !(feature_set in (:baseline, :family, :architecture))
-        error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))")
+    if !(feature_set in (:baseline, :family, :arch))
+        error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
     end
-    if feature_set === :architecture
+    if feature_set === :arch
         cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
         ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
     elseif feature_set === :family
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 6b859957eb..736e7ef23e 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -15,11 +15,11 @@ function Base.hash(params::CUDACompilerParams, h::UInt)
 end
 
 # Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
-# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch`
 # (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
 # emitting the same string on both sides handles all three feature sets correctly.
 function format_target(cap::VersionNumber, feature_set::Symbol)
-    suffix = feature_set === :architecture ? "a" :
+    suffix = feature_set === :arch ? "a" :
              feature_set === :family       ? "f" : ""
     return "sm_$(cap.major)$(cap.minor)$suffix"
 end
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index e5359e50fc..619acaa327 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -434,7 +434,7 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
-- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture`
+- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch`
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 48af0d92f5..2096439b6b 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -262,7 +262,7 @@ end
 
     # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
     # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture)
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch)
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
@@ -276,7 +276,7 @@ end
     # Without feature_set in the hash, two params differing only on feature_set would collide
     # in the compiler cache and silently return a cubin compiled for the wrong feature set.
     base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
-    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch)
     @test hash(base) != hash(arch)
     @test base != arch
 end
@@ -284,11 +284,11 @@ end
 @testset "validate_feature_set" begin
     # Architecture-specific needs CC >= 9.0 and PTX >= 8.0
     # Family-specific needs CC >= 10.0 and PTX >= 8.8.
-    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture)
-    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch)
     @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
     @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
-    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :architecture) === nothing
+    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :arch) === nothing
     @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
     @test CUDACore.validate_feature_set(v"5.0",  v"6.2", :baseline) === nothing
 end

From fbfb253f5e45e9028b53dc039dfa3557f61dcd59 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 08:38:06 +0200
Subject: [PATCH 06/16] Unify version databases again using a sm string.

---
 CUDACore/src/CUDACore.jl             |   1 +
 CUDACore/src/compatibility.jl        | 205 ++++++++++++++++-----------
 CUDACore/src/compiler/compilation.jl |  96 ++++++++-----
 CUDACore/src/compiler/execution.jl   |   9 +-
 CUDACore/src/compiler/sm.jl          |  56 ++++++++
 CUDACore/src/device/runtime.jl       |   2 +-
 CUDACore/src/precompile.jl           |   2 +-
 perf/volumerhs.jl                    |  22 +++
 test/core/codegen.jl                 |  62 ++++++--
 test/core/execution.jl               |  31 +++-
 10 files changed, 346 insertions(+), 140 deletions(-)
 create mode 100644 CUDACore/src/compiler/sm.jl

diff --git a/CUDACore/src/CUDACore.jl b/CUDACore/src/CUDACore.jl
index fa2bfa2b45..c0332759ff 100644
--- a/CUDACore/src/CUDACore.jl
+++ b/CUDACore/src/CUDACore.jl
@@ -80,6 +80,7 @@ include("../lib/cudadrv/CUDAdrv.jl")
 
 # essential stuff
 include("initialization.jl")
+include("compiler/sm.jl")
 include("compatibility.jl")
 include("debug.jl")
 
diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index 08506a37af..770d717834 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -4,31 +4,20 @@ const lowest = v"0"
 const highest = v"999"
 
 
-# PTX compilation targets come in three feature-set flavors, selected via the suffix on the
-# `.target` directive (and the matching `--gpu-name` to ptxas):
+# PTX compilation targets come in three feature-set flavors (carried on `SMVersion`),
+# selected via the suffix on the `.target` directive (and the matching `--gpu-name`
+# to ptxas):
 #
 #   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
 #     for sm_X runs on any sm_Y with Y >= X (onion model).
 #   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
 #     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
-#     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
 #   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
-#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with
-#     CC 9.0; uses the same PTX requirement as the plain target.
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
 #
-function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
-    if !(feature_set in (:baseline, :family, :arch))
-        error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
-    end
-    if feature_set === :arch
-        cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
-        ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
-    elseif feature_set === :family
-        cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap")
-        ptx >= v"8.8"  || error("Family-specific targets require PTX ISA >= 8.8; got $ptx")
-    end
-    return
-end
+# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
+# require for them, is encoded directly in the keys of `ptx_cap_db` and `llvm_cap_db`
+# below: an unsupported combination simply has no entry.
 
 
 ## version range
@@ -159,41 +148,54 @@ end
 
 ## devices supported by each PTX ISA
 
-# Source: PTX ISA document, Release History table
-const ptx_cap_db = Dict(
-    v"1.0"   => between(v"1.0", highest),
-    v"1.1"   => between(v"1.0", highest),
-    v"1.2"   => between(v"1.2", highest),
-    v"1.3"   => between(v"1.2", highest),
-    v"2.0"   => between(v"2.0", highest),
-    v"3.0"   => between(v"3.1", highest),
-    v"3.2"   => between(v"4.0", highest),
-    v"3.5"   => between(v"3.1", highest),
-    v"3.7"   => between(v"4.1", highest),
-    v"5.0"   => between(v"4.0", highest),
-    v"5.2"   => between(v"4.1", highest),
-    v"5.3"   => between(v"4.2", highest),
-    v"6.0"   => between(v"5.0", highest),
-    v"6.1"   => between(v"5.0", highest),
-    v"6.2"   => between(v"5.0", highest),
-    v"7.0"   => between(v"6.0", highest),
-    v"7.2"   => between(v"6.1", highest),
-    v"7.5"   => between(v"6.3", highest),
-    v"8.0"   => between(v"7.0", highest),
-    v"8.6"   => between(v"7.1", highest),
-    v"8.7"   => between(v"7.4", highest),
-    v"8.9"   => between(v"7.8", highest),
-    v"9.0"   => between(v"7.8", highest),
-    v"10.0"  => between(v"8.6", highest),
-    v"10.1"  => between(v"8.6", highest),
-    v"10.3"  => between(v"8.8", highest),
-    v"12.0"  => between(v"8.7", highest),
-    v"12.1"  => between(v"8.8", highest),
+# Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants
+# were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8.
+const ptx_cap_db = Dict{SMVersion, VersionRange}(
+    sm"1.0"   => between(v"1.0", highest),
+    sm"1.1"   => between(v"1.0", highest),
+    sm"1.2"   => between(v"1.2", highest),
+    sm"1.3"   => between(v"1.2", highest),
+    sm"2.0"   => between(v"2.0", highest),
+    sm"3.0"   => between(v"3.1", highest),
+    sm"3.2"   => between(v"4.0", highest),
+    sm"3.5"   => between(v"3.1", highest),
+    sm"3.7"   => between(v"4.1", highest),
+    sm"5.0"   => between(v"4.0", highest),
+    sm"5.2"   => between(v"4.1", highest),
+    sm"5.3"   => between(v"4.2", highest),
+    sm"6.0"   => between(v"5.0", highest),
+    sm"6.1"   => between(v"5.0", highest),
+    sm"6.2"   => between(v"5.0", highest),
+    sm"7.0"   => between(v"6.0", highest),
+    sm"7.2"   => between(v"6.1", highest),
+    sm"7.5"   => between(v"6.3", highest),
+    sm"8.0"   => between(v"7.0", highest),
+    sm"8.6"   => between(v"7.1", highest),
+    sm"8.7"   => between(v"7.4", highest),
+    sm"8.9"   => between(v"7.8", highest),
+    sm"9.0"   => between(v"7.8", highest),
+    sm"9.0a"  => between(v"8.0", highest),
+    sm"10.0"  => between(v"8.6", highest),
+    sm"10.0a" => between(v"8.6", highest),
+    sm"10.0f" => between(v"8.8", highest),
+    sm"10.1"  => between(v"8.6", highest),
+    sm"10.1a" => between(v"8.6", highest),
+    sm"10.1f" => between(v"8.8", highest),
+    sm"10.3"  => between(v"8.8", highest),
+    sm"10.3a" => between(v"8.8", highest),
+    sm"10.3f" => between(v"8.8", highest),
+    sm"12.0"  => between(v"8.7", highest),
+    sm"12.0a" => between(v"8.7", highest),
+    sm"12.0f" => between(v"8.8", highest),
+    sm"12.1"  => between(v"8.8", highest),
+    sm"12.1a" => between(v"8.8", highest),
+    sm"12.1f" => between(v"8.8", highest),
 )
 
+# Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`.
 function ptx_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in ptx_cap_db
+    caps = Set{SMVersion}()
+    for (cap, r) in ptx_cap_db
         if ver in r
             push!(caps, cap)
         end
@@ -201,41 +203,66 @@ function ptx_cap_support(ver::VersionNumber)
     return caps
 end
 
+# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic.
+function ptx_baseline_caps(ver::VersionNumber)
+    caps = Set{VersionNumber}()
+    for (cap, r) in ptx_cap_db
+        if cap.feature_set === :baseline && ver in r
+            push!(caps, base_version(cap))
+        end
+    end
+    return caps
+end
+
 
 ## devices supported by the LLVM NVPTX back-end
 
-# Source: LLVM/lib/Target/NVPTX/NVPTX.td
-const llvm_cap_db = Dict(
-    v"2.0"   => between(v"3.2", highest),
-    v"2.1"   => between(v"3.2", highest),
-    v"3.0"   => between(v"3.2", highest),
-    v"3.2"   => between(v"3.7", highest),
-    v"3.5"   => between(v"3.2", highest),
-    v"3.7"   => between(v"3.7", highest),
-    v"5.0"   => between(v"3.5", highest),
-    v"5.2"   => between(v"3.7", highest),
-    v"5.3"   => between(v"3.7", highest),
-    v"6.0"   => between(v"3.9", highest),
-    v"6.1"   => between(v"3.9", highest),
-    v"6.2"   => between(v"3.9", highest),
-    v"7.0"   => between(v"6", highest),
-    v"7.2"   => between(v"7", highest),
-    v"7.5"   => between(v"8", highest),
-    v"8.0"   => between(v"11", highest),
-    v"8.6"   => between(v"13", highest),
-    v"8.7"   => between(v"16", highest),
-    v"8.9"   => between(v"16", highest),
-    v"9.0"   => between(v"16", highest),
-    v"10.0"  => between(v"20", highest),
-    v"10.1"  => between(v"20", highest),
-    v"10.3"  => between(v"21", highest),
-    v"12.0"  => between(v"20", highest),
-    v"12.1"  => between(v"21", highest),
+# Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up
+# here as a separate entry; without an entry LLVM does not know the variant CPU name and
+# constructing a TargetMachine with it would fall back to a generic subtarget.
+const llvm_cap_db = Dict{SMVersion, VersionRange}(
+    sm"2.0"   => between(v"3.2", highest),
+    sm"2.1"   => between(v"3.2", highest),
+    sm"3.0"   => between(v"3.2", highest),
+    sm"3.2"   => between(v"3.7", highest),
+    sm"3.5"   => between(v"3.2", highest),
+    sm"3.7"   => between(v"3.7", highest),
+    sm"5.0"   => between(v"3.5", highest),
+    sm"5.2"   => between(v"3.7", highest),
+    sm"5.3"   => between(v"3.7", highest),
+    sm"6.0"   => between(v"3.9", highest),
+    sm"6.1"   => between(v"3.9", highest),
+    sm"6.2"   => between(v"3.9", highest),
+    sm"7.0"   => between(v"6", highest),
+    sm"7.2"   => between(v"7", highest),
+    sm"7.5"   => between(v"8", highest),
+    sm"8.0"   => between(v"11", highest),
+    sm"8.6"   => between(v"13", highest),
+    sm"8.7"   => between(v"16", highest),
+    sm"8.9"   => between(v"16", highest),
+    sm"9.0"   => between(v"16", highest),
+    sm"9.0a"  => between(v"18", highest),
+    sm"10.0"  => between(v"20", highest),
+    sm"10.0a" => between(v"20", highest),
+    sm"10.0f" => between(v"21", highest),
+    sm"10.1"  => between(v"20", highest),
+    sm"10.1a" => between(v"20", highest),
+    sm"10.1f" => between(v"21", highest),
+    sm"10.3"  => between(v"21", highest),
+    sm"10.3a" => between(v"21", highest),
+    sm"10.3f" => between(v"21", highest),
+    sm"12.0"  => between(v"20", highest),
+    sm"12.0a" => between(v"20", highest),
+    sm"12.0f" => between(v"21", highest),
+    sm"12.1"  => between(v"21", highest),
+    sm"12.1a" => between(v"21", highest),
+    sm"12.1f" => between(v"21", highest),
 )
 
+# Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`.
 function llvm_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in llvm_cap_db
+    caps = Set{SMVersion}()
+    for (cap, r) in llvm_cap_db
         if ver in r
             push!(caps, cap)
         end
@@ -243,6 +270,17 @@ function llvm_cap_support(ver::VersionNumber)
     return caps
 end
 
+# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic.
+function llvm_baseline_caps(ver::VersionNumber)
+    caps = Set{VersionNumber}()
+    for (cap, r) in llvm_cap_db
+        if cap.feature_set === :baseline && ver in r
+            push!(caps, base_version(cap))
+        end
+    end
+    return caps
+end
+
 
 ## PTX ISAs supported by the LVM NVPTX back-end
 
@@ -299,7 +337,10 @@ end
 function llvm_compat(version=LLVM.version())
     LLVM.InitializeNVPTXTarget()
 
-    cap_support = sort(collect(llvm_cap_support(version)))
+    # the `.cap` field is used for the base-cap clamp in `_compiler_config`, so only
+    # baseline entries are surfaced here. Variant support is queried point-wise via
+    # `sm in llvm_cap_support(...)`.
+    cap_support = sort(collect(llvm_baseline_caps(version)))
     ptx_support = sort(collect(llvm_ptx_support(version)))
 
     return (cap=cap_support, ptx=ptx_support)
@@ -326,5 +367,7 @@ function cuda_compat(runtime=runtime_version(), compiler=compiler_version())
 end
 
 function ptx_compat(ptx)
-    return (cap=ptx_cap_support(ptx),)
+    # Baseline view for the clamp; variant support is queried point-wise via
+    # `sm in ptx_cap_support(...)`.
+    return (cap=ptx_baseline_caps(ptx),)
 end
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 736e7ef23e..9486603dc5 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -1,29 +1,17 @@
 ## gpucompiler interface implementation
 
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
-    cap::VersionNumber
+    sm::SMVersion
     ptx::VersionNumber
-    feature_set::Symbol = :baseline
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
-    h = hash(params.cap, h)
+    h = hash(params.sm, h)
     h = hash(params.ptx, h)
-    h = hash(params.feature_set, h)
 
     return h
 end
 
-# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
-# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch`
-# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
-# emitting the same string on both sides handles all three feature sets correctly.
-function format_target(cap::VersionNumber, feature_set::Symbol)
-    suffix = feature_set === :arch ? "a" :
-             feature_set === :family       ? "f" : ""
-    return "sm_$(cap.major)$(cap.minor)$suffix"
-end
-
 const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams}
 const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams}
 
@@ -131,10 +119,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap, feature_set)
+function rewrite_ptx_header(asm, ptx::VersionNumber, sm::SMVersion)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))")
+        r"\.target sm_\d+\w*" => ".target $(cpu_name(sm))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -157,12 +145,16 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap, feature_set) = job.config.params
+    # The rewrite stamps `.target`/`.version` with the *requested* (cuda-side) values.
+    # When the GPUCompiler-side target matches, LLVM already emits the right header
+    # (including the `a`/`f` suffix, via the CPU name); we only rewrite when they differ,
+    # e.g. when we had to clamp the target down for LLVM compatibility.
+    (; ptx, sm) = job.config.params
     needs_rewrite = job.config.target.ptx != ptx ||
-                    job.config.target.cap != cap ||
-                    feature_set !== :baseline
+                    job.config.target.cap != base_version(sm) ||
+                    job.config.target.feature_set !== sm.feature_set
     if needs_rewrite
-        asm = rewrite_ptx_header(asm, ptx, cap, feature_set)
+        asm = rewrite_ptx_header(asm, ptx, sm)
     end
 
     return asm
@@ -194,7 +186,14 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, feature_set=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, kwargs...)
+    # convert / deprecate VersionNumber cap
+    if cap isa VersionNumber
+        Base.depwarn("Passing a VersionNumber to `cap=` is deprecated; use the `sm\"$(cap.major).$(cap.minor)\"` string macro instead.",
+                     :cufunction)
+        cap = SMVersion(cap.major, cap.minor, :baseline)
+    end
+
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -225,9 +224,12 @@ end
 
     # determine the compute capabilities to use. this should match the capability of the
     # current device, but if LLVM doesn't support it, we can target an older capability
-    # and pass a different `-arch` to `ptxas`.
+    # and pass a different `-arch` to `ptxas`. The feature_set rides on `cap::SMVersion`;
+    # the clamp logic below operates on the base version, with the feature_set carried
+    # through unchanged.
     ptx_support = ptx_compat(cuda_ptx)
-    requested_cap = @something(cap, min(capability(dev), maximum(ptx_support.cap)))
+    base_cap = cap === nothing ? nothing : base_version(cap)
+    requested_cap = @something(base_cap, min(capability(dev), maximum(ptx_support.cap)))
     llvm_caps = filter(<=(requested_cap), llvm_support.cap)
     if cap !== nothing
         ## use the highest capability supported by LLVM
@@ -235,30 +237,53 @@ end
             error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())")
         llvm_cap = maximum(llvm_caps)
         ## use the capability as-is to invoke CUDA
-        cuda_cap = cap
+        cuda_sm = cap
     else
         ## use the highest capability supported by LLVM
         isempty(llvm_caps) &&
             error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())")
         llvm_cap = maximum(llvm_caps)
-        ## use the highest capability supported by CUDA
+        ## use the highest capability supported by CUDA -- always baseline when defaulted
+        ## from the device (the device cap has no feature_set)
         cuda_caps = filter(<=(capability(dev)), cuda_support.cap)
         isempty(cuda_caps) &&
             error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())")
-        cuda_cap = maximum(cuda_caps)
+        cuda_sm = SMVersion(maximum(cuda_caps).major, maximum(cuda_caps).minor, :baseline)
     end
 
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = compiler_version() >= v"11.7"
 
-    # Conservatively pick baseline for backward compatibility,
-    # requiring explicit opt-in for family- and architecture-specific instructions. 
-    feature_set = something(feature_set, :baseline)
-    validate_feature_set(cuda_cap, cuda_ptx, feature_set)
+    # An invalid (cap, feature_set, ptx) variant combination simply has no entry in
+    # `ptx_cap_db`: e.g. `sm"7.0a"` (no `*a` below CC 9.0) or `sm"10.0f"` at PTX 8.7
+    # (`*f` needs 8.8). Only enforced for variants -- baseline lets inspection tools
+    # like `code_ptx` mix `cap` and `ptx` freely, even if the result won't actually load.
+    if cuda_sm.feature_set !== :baseline
+        cuda_sm in ptx_cap_support(cuda_ptx) ||
+            error("$(cpu_name(cuda_sm)) is not supported by PTX ISA $(cuda_ptx)")
+    end
+
+    # GPUCompiler's PTXCompilerTarget needs an `sm` the *installed* LLVM understands
+    # (the `*a`/`*f` CPU names are version-gated, see `llvm_cap_db`). If LLVM is too
+    # old to know the variant for the LLVM-side cap, fall back to baseline; the post-mcgen
+    # rewrite then patches `.target` to the requested value -- which is enough for
+    # inline-PTX users but won't unlock LLVM's arch-accelerated code paths. ptxas is more
+    # permissive than LLVM about claiming a more-specific `.target` than the IR actually
+    # uses, so the upgrade is safe; the reverse would not be.
+    llvm_sm = SMVersion(llvm_cap.major, llvm_cap.minor, cuda_sm.feature_set)
+    if !(llvm_sm in llvm_cap_support(LLVM.version()))
+        if cuda_sm.feature_set !== :baseline
+            @warn "LLVM $(LLVM.version()) does not support $(cpu_name(llvm_sm)) " *
+                  "(needed to compile for $(cpu_name(cuda_sm))); LLVM will emit baseline " *
+                  "code (inline PTX assembly is unaffected)."
+        end
+        llvm_sm = SMVersion(llvm_sm.major, llvm_sm.minor, :baseline)
+    end
 
     # create GPUCompiler objects
-    target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set)
+    target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx,
+                                 feature_set=llvm_sm.feature_set, debuginfo, kwargs...)
+    params = CUDACompilerParams(; sm=cuda_sm, ptx=cuda_ptx)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -293,8 +318,9 @@ function compile(@nospecialize(job::CompilerJob))
         push!(ptxas_opts, "--compile-only")
     end
 
-    (; ptx, cap, feature_set) = job.config.params
-    arch = format_target(cap, feature_set)
+    (; ptx, sm) = job.config.params
+    cap = base_version(sm)
+    arch = cpu_name(sm)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
@@ -307,7 +333,7 @@ function compile(@nospecialize(job::CompilerJob))
     end
     if param_usage > param_limit
         msg = """Kernel invocation uses too much parameter memory.
-                 $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by sm_$(cap.major)$(cap.minor) / PTX v$(ptx.major).$(ptx.minor)."""
+                 $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by $(arch) / PTX v$(ptx.major).$(ptx.minor)."""
 
         try
             details = "\n\nRelevant parameters:"
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 619acaa327..a67a3f61ed 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -433,8 +433,11 @@ The following keyword arguments are supported:
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
-- `cap` and `ptx`: to override the compute capability and PTX version to compile for
-- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch`
+- `cap` and `ptx`: to override the compute capability and PTX version to compile for.
+  `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g.
+  `cap=sm"10.3a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"10.0f"`
+  for family-portable Blackwell codegen. The bare form `cap=sm"10.3"` selects baseline
+  (forward-compatible) codegen. Passing a `VersionNumber` is deprecated.
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
new file mode 100644
index 0000000000..703d798159
--- /dev/null
+++ b/CUDACore/src/compiler/sm.jl
@@ -0,0 +1,56 @@
+# SMVersion: a PTX compilation target, identifying a CUDA compute capability together
+# with its subtarget feature set.
+#
+# Constructed via the `sm"..."` string macro, which accepts the dotted form used in
+# CUDA's documentation: `sm"10.3a"` is compute capability 10.3 with architecture-
+# accelerated features. The bare form `sm"10.3"` is the default, forward-compatible
+# feature set (the "onion model").
+#
+# See `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature
+# definitions, and CUDA's PTX ISA documentation under `.target` for the runtime
+# compatibility implications:
+#
+#   :baseline (no suffix)   - forward-compatible (sm_X for any sm_Y >= X)
+#   :family   ('f' suffix)  - same-major-family-portable
+#   :arch     ('a' suffix)  - locked to one exact CC
+
+export SMVersion, @sm_str
+
+struct SMVersion
+    major::Int
+    minor::Int
+    feature_set::Symbol
+
+    function SMVersion(major::Integer, minor::Integer, feature_set::Symbol = :baseline)
+        feature_set in (:baseline, :family, :arch) ||
+            error("SMVersion feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
+        return new(Int(major), Int(minor), feature_set)
+    end
+end
+
+# Suffix on the LLVM CPU name / `.target` directive
+suffix(sm::SMVersion) = sm.feature_set === :arch    ? "a" :
+                        sm.feature_set === :family  ? "f" : ""
+
+# LLVM CPU / PTX `.target` name (e.g. "sm_103a").
+cpu_name(sm::SMVersion) = "sm_$(sm.major)$(sm.minor)$(suffix(sm))"
+
+# Drop the feature set to recover the base compute-capability `VersionNumber`,
+# usable against the version-keyed compatibility databases.
+base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor)
+
+Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"")
+
+function _parse_sm(s::AbstractString)
+    m = match(r"^(\d+)\.(\d+)([af]?)$", s)
+    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"")
+    major = parse(Int, m.captures[1])
+    minor = parse(Int, m.captures[2])
+    fs = m.captures[3] == "a" ? :arch :
+         m.captures[3] == "f" ? :family : :baseline
+    return SMVersion(major, minor, fs)
+end
+
+macro sm_str(s)
+    return _parse_sm(s)
+end
diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl
index cd3687f0ea..4457d37330 100644
--- a/CUDACore/src/device/runtime.jl
+++ b/CUDACore/src/device/runtime.jl
@@ -19,7 +19,7 @@ function precompile_runtime()
             # NOTE: this often runs when we don't have a functioning set-up,
             #       so we don't use `compiler_config` which requires NVML
             target = PTXCompilerTarget(; cap, ptx, debuginfo)
-            params = CUDACompilerParams(; cap, ptx)
+            params = CUDACompilerParams(; sm=SMVersion(cap.major, cap.minor), ptx)
             config = CompilerConfig(target, params)
             job = CompilerJob(mi, config)
             GPUCompiler.load_runtime(job)
diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl
index 58ae94f536..e145f63467 100644
--- a/CUDACore/src/precompile.jl
+++ b/CUDACore/src/precompile.jl
@@ -17,7 +17,7 @@ if :NVPTX in LLVM.backends()
             llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))
 
             target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
-            params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
+            params = CUDACompilerParams(; sm=SMVersion(llvm_cap.major, llvm_cap.minor), ptx=llvm_ptx)
             config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)
 
             tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index 6049552626..e347cf8300 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -232,11 +232,33 @@ function main()
               - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory,
                 $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
                 $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
+
+    # Run once to validate: the result must be finite and the L1 sum must
+    # match a baked-in reference computed from this same StableRNG(123)
+    # seed. cuTile/perf/volumerhs.jl uses the same reference so the two
+    # implementations can be cross-checked.
+    CUDA.@sync kernel(rhs, Q, vgeo, DFloat(grav), D, nelem;
+                       threads=threads, blocks=nelem)
+    rhs_h = Array(rhs)
+    @assert all(isfinite, rhs_h) "kernel produced non-finite values"
+    rsum = sum(rhs_h)
+    ref  = 1.4227473f10
+    rel  = abs(rsum - ref) / abs(ref)
+    @assert rel < 1f-3 "rhs checksum off by $rel (got $rsum, expected $ref)"
+    @info "validation passed" rhs_sum=rsum reference=ref rel_err=rel
+    fill!(rhs, 0)
+
     results = @benchmark begin
+        # zero rhs each iteration so accumulation stays meaningful
+        fill!($rhs, 0)
         CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
                                          threads=$threads, blocks=$nelem)
     end
 
+    bytes = nelem * 28 * Nq^3 * sizeof(DFloat)
+    bw    = bytes / (minimum(results).time / 1e9) / 1e9
+    @info "SIMT volumerhs! benchmark" min_ms=minimum(results).time/1e6 median_ms=median(results).time/1e6 effective_BW="$(round(Int, bw)) GB/s"
+
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
     CUDA.unsafe_free!(rhs)
     CUDA.unsafe_free!(Q)
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 2096439b6b..ffac61be81 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -255,42 +255,74 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline)
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0")
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
 
     # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
     # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch)
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0a")
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
     # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
-    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family)
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"10.0f")
     @test occursin(".target sm_100f", asm_family)
     @test success(run_ptxas(asm_family, "sm_100f"))
 end
 
+@testset "SMVersion and sm\"...\" macro" begin
+    @test sm"9.0"   == SMVersion(9, 0, :baseline)
+    @test sm"9.0a"  == SMVersion(9, 0, :arch)
+    @test sm"10.0f" == SMVersion(10, 0, :family)
+    # printing roundtrips via the macro form
+    @test sprint(show, sm"10.3a") == "sm\"10.3a\""
+    @test sprint(show, sm"10.0")  == "sm\"10.0\""
+    # cpu_name reflects feature_set
+    @test CUDACore.cpu_name(sm"9.0")   == "sm_90"
+    @test CUDACore.cpu_name(sm"9.0a")  == "sm_90a"
+    @test CUDACore.cpu_name(sm"10.0f") == "sm_100f"
+    # base_version drops the suffix back to a comparable VersionNumber
+    @test CUDACore.base_version(sm"10.3a") == v"10.3"
+    # constructor rejects bogus feature_set
+    @test_throws ErrorException SMVersion(9, 0, :bogus)
+    # macro rejects malformed strings
+    @test_throws ErrorException CUDACore._parse_sm("103a")     # missing dot
+    @test_throws ErrorException CUDACore._parse_sm("10.3x")    # unknown suffix
+    @test_throws ErrorException CUDACore._parse_sm("10")       # missing minor
+end
+
 @testset "CUDACompilerParams hash discriminates on feature_set" begin
     # Without feature_set in the hash, two params differing only on feature_set would collide
     # in the compiler cache and silently return a cubin compiled for the wrong feature set.
-    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
-    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch)
+    base = CUDACore.CUDACompilerParams(sm=sm"9.0",  ptx=v"8.0")
+    arch = CUDACore.CUDACompilerParams(sm=sm"9.0a", ptx=v"8.0")
     @test hash(base) != hash(arch)
     @test base != arch
 end
 
-@testset "validate_feature_set" begin
-    # Architecture-specific needs CC >= 9.0 and PTX >= 8.0
-    # Family-specific needs CC >= 10.0 and PTX >= 8.8.
-    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch)
-    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch)
-    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
-    @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
-    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :arch) === nothing
-    @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
-    @test CUDACore.validate_feature_set(v"5.0",  v"6.2", :baseline) === nothing
+@testset "ptx_cap_support" begin
+    # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"9.0") and PTX >= 8.0.
+    # Family-specific needs CC >= 10.0 (no `*f` keys below sm"10.0") and PTX >= 8.8.
+    @test !(sm"8.6a" in CUDACore.ptx_cap_support(v"8.0"))   # no `*a` below CC 9.0
+    @test !(sm"9.0a" in CUDACore.ptx_cap_support(v"7.8"))   # `*a` requires PTX >= 8.0
+    @test !(sm"9.0f" in CUDACore.ptx_cap_support(v"8.0"))   # no `*f` below CC 10.0
+    @test !(sm"10.0f" in CUDACore.ptx_cap_support(v"8.7"))  # `*f` requires PTX >= 8.8
+    @test  sm"9.0a"  in CUDACore.ptx_cap_support(v"8.0")
+    @test  sm"10.0f" in CUDACore.ptx_cap_support(v"8.8")
+    @test  sm"5.0"   in CUDACore.ptx_cap_support(v"6.2")
+end
+
+@testset "llvm_cap_support" begin
+    # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td.
+    @test  sm"10.3a" in CUDACore.llvm_cap_support(v"21")
+    @test  sm"10.0f" in CUDACore.llvm_cap_support(v"21")
+    @test  sm"9.0a"  in CUDACore.llvm_cap_support(v"18")
+    @test !(sm"9.0a"  in CUDACore.llvm_cap_support(v"17"))  # sm_90a added in LLVM 18
+    @test !(sm"10.3a" in CUDACore.llvm_cap_support(v"20"))  # sm_103a added in LLVM 21
+    @test !(sm"10.0f" in CUDACore.llvm_cap_support(v"20"))  # sm_100f added in LLVM 21
+    @test  sm"7.0"   in CUDACore.llvm_cap_support(v"15")
 end
 
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index edf589913d..f54da84bef 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -50,17 +50,40 @@ end
     @cuda threads=2 dummy()
 
     # sm_10 isn't supported by LLVM
-    @test_throws "not supported by LLVM" @cuda launch=false cap=v"1.0" dummy()
+    @test_throws "not supported by LLVM" @cuda launch=false cap=sm"1.0" dummy()
     # sm_20 is, but not by any CUDA version we support
-    @test_throws "Failed to compile PTX code" @cuda launch=false cap=v"2.0" dummy()
+    @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"2.0" dummy()
     # there isn't any capability other than the device's that's guaruanteed to work
-    @cuda launch=false cap=capability(device()) dummy()
+    dev_cap = capability(device())
+    dev_sm = SMVersion(dev_cap.major, dev_cap.minor)
+    @cuda launch=false cap=dev_sm dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0"))
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0"))
     @test contains(asm, ".target sm_50")
 
     asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3"))
     @test contains(asm, ".version 6.3")
+
+    # feature_set is selected by the suffix on the sm"..." string; the suffix should
+    # surface in the .target directive in the PTX output.
+    if dev_cap >= v"9.0"
+        sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch)
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        # arch-specific cubin should also actually launch on the matching device
+        @cuda cap=sm_a dummy()
+    end
+    if dev_cap >= v"10.0"
+        sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
+        @cuda cap=sm_f dummy()
+    end
+
+    # passing a VersionNumber to `cap` is deprecated; check the depwarn fires while
+    # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on
+    # newer CUDA toolkits no longer accepts sm_50.)
+    @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0"))
 end
 
 

From 9450ef74d5ab51704c5f7d258a22485e04709c01 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 11:02:32 +0200
Subject: [PATCH 07/16] Rework device compatibility handling.

---
 CUDACore/lib/cudadrv/state.jl             |   2 +-
 CUDACore/src/compatibility.jl             |  74 +++----------
 CUDACore/src/compiler/compilation.jl      | 123 +++++++++-------------
 CUDACore/src/compiler/sm.jl               |  17 +++
 CUDACore/src/device/intrinsics/version.jl |  17 ++-
 CUDACore/src/device/runtime.jl            |  10 +-
 CUDACore/src/precompile.jl                |  11 +-
 test/core/execution.jl                    |  46 +++++++-
 8 files changed, 153 insertions(+), 147 deletions(-)

diff --git a/CUDACore/lib/cudadrv/state.jl b/CUDACore/lib/cudadrv/state.jl
index 6c03564bb8..32159136fa 100644
--- a/CUDACore/lib/cudadrv/state.jl
+++ b/CUDACore/lib/cudadrv/state.jl
@@ -227,7 +227,7 @@ function context(dev::CuDevice)
                maxlog=1, _id=devidx)
     end
     # ... or too new
-    if !in(capability(dev), cuda_compat().cap)
+    if !in(capability(dev), ptxas_compat().cap)
         @warn("""Your $(name(dev)) GPU (compute capability $(capability(dev).major).$(capability(dev).minor)) is not fully supported by CUDA $(runtime_version()).
                  Some functionality may be broken. Ensure you are using the latest version of CUDA.jl in combination with an up-to-date NVIDIA driver.
                  If that does not help, please file an issue to add support for the latest CUDA toolkit.""",
diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index 770d717834..71a0dfc0f0 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -36,12 +36,12 @@ Base.intersect(v::VersionNumber, r::VersionRange) =
     v > r.upper ? (v:r.upper) : (v:v)
 
 
-## devices supported by the CUDA toolkit
+## devices supported by ptxas
 
 # Source:
 # - https://en.wikipedia.org/wiki/CUDA#GPUs_supported
 # - ptxas |& grep -A 10 '\--gpu-name'
-const cuda_cap_db = Dict(
+const ptxas_cap_db = Dict(
     v"1.0"   => between(lowest, v"6.5"),
     v"1.1"   => between(lowest, v"6.5"),
     v"1.2"   => between(lowest, v"6.5"),
@@ -73,9 +73,9 @@ const cuda_cap_db = Dict(
     v"12.1"  => between(v"12.9", highest),
 )
 
-function cuda_cap_support(ver::VersionNumber)
+function ptxas_cap_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_cap_db
+    for (cap,r) in ptxas_cap_db
         if ver in r
             push!(caps, cap)
         end
@@ -84,10 +84,10 @@ function cuda_cap_support(ver::VersionNumber)
 end
 
 
-## PTX ISAs supported by the CUDA toolkit
+## PTX ISAs supported by ptxas
 
 # Source: PTX ISA document, Release History table
-const cuda_ptx_db = Dict(
+const ptxas_ptx_db = Dict(
     v"1.0" => between(v"1.0", highest),
     v"1.1" => between(v"1.1", highest),
     v"1.2" => between(v"2.0", highest),
@@ -135,9 +135,9 @@ const cuda_ptx_db = Dict(
     v"9.2" => between(v"13.2", highest),
 )
 
-function cuda_ptx_support(ver::VersionNumber)
+function ptxas_ptx_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_ptx_db
+    for (cap,r) in ptxas_ptx_db
         if ver in r
             push!(caps, cap)
         end
@@ -203,17 +203,6 @@ function ptx_cap_support(ver::VersionNumber)
     return caps
 end
 
-# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic.
-function ptx_baseline_caps(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap, r) in ptx_cap_db
-        if cap.feature_set === :baseline && ver in r
-            push!(caps, base_version(cap))
-        end
-    end
-    return caps
-end
-
 
 ## devices supported by the LLVM NVPTX back-end
 
@@ -270,17 +259,6 @@ function llvm_cap_support(ver::VersionNumber)
     return caps
 end
 
-# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic.
-function llvm_baseline_caps(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap, r) in llvm_cap_db
-        if cap.feature_set === :baseline && ver in r
-            push!(caps, base_version(cap))
-        end
-    end
-    return caps
-end
-
 
 ## PTX ISAs supported by the LVM NVPTX back-end
 
@@ -337,37 +315,11 @@ end
 function llvm_compat(version=LLVM.version())
     LLVM.InitializeNVPTXTarget()
 
-    # the `.cap` field is used for the base-cap clamp in `_compiler_config`, so only
-    # baseline entries are surfaced here. Variant support is queried point-wise via
-    # `sm in llvm_cap_support(...)`.
-    cap_support = sort(collect(llvm_baseline_caps(version)))
-    ptx_support = sort(collect(llvm_ptx_support(version)))
-
-    return (cap=cap_support, ptx=ptx_support)
-end
-
-function cuda_compat(runtime=runtime_version(), compiler=compiler_version())
-    # we don't have to check the driver version, because it offers backwards compatbility
-    # beyond the CUDA toolkit version (e.g. R580 for CUDA 13 still supports Volta as
-    # deprecated in CUDA 13), and we don't have a reliable way to query the actual version
-    # as NVML isn't available on all platforms. let's instead simply assume that unsupported
-    # devices will not be exposed to the CUDA runtime and thus won't be visible to us.
-
-    # the compiler and runtime are versioned independently (and either can come from a
-    # local install), so we need to consider both:
-    # - device caps are dropped when either ptxas can't emit for them or the runtime
-    #   libraries drop them. take the intersection of both supported sets.
-    # - PTX ISA availability is a property of ptxas; the runtime doesn't care which ISA
-    #   compiled cubin came from.
-    cap_support = sort(collect(intersect(cuda_cap_support(runtime),
-                                         cuda_cap_support(compiler))))
-    ptx_support = sort(collect(cuda_ptx_support(compiler)))
-
-    return (cap=cap_support, ptx=ptx_support)
+    return (cap=llvm_cap_support(version),
+            ptx=llvm_ptx_support(version))
 end
 
-function ptx_compat(ptx)
-    # Baseline view for the clamp; variant support is queried point-wise via
-    # `sm in ptx_cap_support(...)`.
-    return (cap=ptx_baseline_caps(ptx),)
+function ptxas_compat(version=compiler_version())
+    return (cap=ptxas_cap_support(version),
+            ptx=ptxas_ptx_support(version))
 end
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 9486603dc5..32ff9f3375 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -194,96 +194,73 @@ end
         cap = SMVersion(cap.major, cap.minor, :baseline)
     end
 
-    # determine the toolchain
+    # inspect the toolchain
     llvm_support = llvm_compat()
-    cuda_support = cuda_compat()
+    ptxas_support = ptxas_compat()
 
-    # determine the PTX ISA to use. we want at least 6.2, but will use newer if possible.
-    requested_ptx = something(ptx, v"6.2")
-    llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
-    cuda_ptxs = filter(>=(requested_ptx), cuda_support.ptx)
+    # determine the PTX ISA to use.
     if ptx !== nothing
-        # the user requested a specific PTX ISA
-        ## use the highest ISA supported by LLVM
-        isempty(llvm_ptxs) &&
+        # explicit request: take it exactly, validating against the toolchain
+        ptx in llvm_support.ptx ||
             error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())")
-        llvm_ptx = maximum(llvm_ptxs)
-        ## use the ISA as-is to invoke CUDA
-        cuda_ptx = ptx
+        ptx in ptxas_support.ptx ||
+            error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())")
+        llvm_ptx = ptxas_ptx = ptx
     else
-        # try to do the best thing (i.e., use the newest PTX ISA)
-        # XXX: is it safe to just use the latest PTX ISA? isn't it possible for, e.g.,
-        #      instructions to get deprecated?
+        # default: pick the newest PTX ISA supported by the toolchain (>=v6.2)
+        requested_ptx = v"6.2"
+        llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx)
+        ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx)
         isempty(llvm_ptxs) &&
             error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())")
-        llvm_ptx = maximum(llvm_ptxs)
-        isempty(cuda_ptxs) &&
-            error("CUDA.jl requires PTX $requested_ptx, which is not supported by CUDA $(compiler_version())")
-        cuda_ptx = maximum(cuda_ptxs)
+        isempty(ptxas_ptxs) &&
+            error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())")
+        ptxas_ptx = maximum(ptxas_ptxs)
+        llvm_ptx = min(maximum(llvm_ptxs), ptxas_ptx)
     end
 
-    # determine the compute capabilities to use. this should match the capability of the
-    # current device, but if LLVM doesn't support it, we can target an older capability
-    # and pass a different `-arch` to `ptxas`. The feature_set rides on `cap::SMVersion`;
-    # the clamp logic below operates on the base version, with the feature_set carried
-    # through unchanged.
-    ptx_support = ptx_compat(cuda_ptx)
-    base_cap = cap === nothing ? nothing : base_version(cap)
-    requested_cap = @something(base_cap, min(capability(dev), maximum(ptx_support.cap)))
-    llvm_caps = filter(<=(requested_cap), llvm_support.cap)
+    # when selecting compute capabilities, we prefer the most recent one, as
+    # well as prefer to use architecture-accelerated features when available.
+    fs_rank(fs::Symbol) = fs === :arch ? 2 : fs === :family ? 1 : 0
+    sm_key(sm::SMVersion) = (base_version(sm), fs_rank(sm.feature_set))
+
+    # determine the compute capability to use.
+    ## ptxas
+    ptx_caps = ptx_cap_support(ptxas_ptx)
     if cap !== nothing
-        ## use the highest capability supported by LLVM
-        isempty(llvm_caps) &&
-            error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())")
-        llvm_cap = maximum(llvm_caps)
-        ## use the capability as-is to invoke CUDA
-        cuda_sm = cap
+        # explicit request: take it as-is, validating against the PTX ISA
+        cap in ptx_caps ||
+            error("$(cpu_name(cap)) is not supported by PTX ISA $(ptxas_ptx)")
+        ptxas_sm = cap
     else
-        ## use the highest capability supported by LLVM
-        isempty(llvm_caps) &&
-            error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())")
-        llvm_cap = maximum(llvm_caps)
-        ## use the highest capability supported by CUDA -- always baseline when defaulted
-        ## from the device (the device cap has no feature_set)
-        cuda_caps = filter(<=(capability(dev)), cuda_support.cap)
-        isempty(cuda_caps) &&
-            error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())")
-        cuda_sm = SMVersion(maximum(cuda_caps).major, maximum(cuda_caps).minor, :baseline)
+        # pick the most specific capability the selected PTX ISA supports whose cubin
+        # would actually load on the current device. For baseline that's the onion model;
+        # `:arch` requires an exact CC match, `:family` a same-family match.
+        ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_caps)
+        isempty(ptxas_candidates) &&
+            error("Compute capability $(capability(dev)) is not supported by ptxas " *
+                  "$(compiler_version()) at PTX ISA $(ptxas_ptx)")
+        ptxas_sm = argmax(sm_key, ptxas_candidates)
     end
-
-    # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
-    debuginfo = compiler_version() >= v"11.7"
-
-    # An invalid (cap, feature_set, ptx) variant combination simply has no entry in
-    # `ptx_cap_db`: e.g. `sm"7.0a"` (no `*a` below CC 9.0) or `sm"10.0f"` at PTX 8.7
-    # (`*f` needs 8.8). Only enforced for variants -- baseline lets inspection tools
-    # like `code_ptx` mix `cap` and `ptx` freely, even if the result won't actually load.
-    if cuda_sm.feature_set !== :baseline
-        cuda_sm in ptx_cap_support(cuda_ptx) ||
-            error("$(cpu_name(cuda_sm)) is not supported by PTX ISA $(cuda_ptx)")
-    end
-
-    # GPUCompiler's PTXCompilerTarget needs an `sm` the *installed* LLVM understands
-    # (the `*a`/`*f` CPU names are version-gated, see `llvm_cap_db`). If LLVM is too
-    # old to know the variant for the LLVM-side cap, fall back to baseline; the post-mcgen
-    # rewrite then patches `.target` to the requested value -- which is enough for
-    # inline-PTX users but won't unlock LLVM's arch-accelerated code paths. ptxas is more
-    # permissive than LLVM about claiming a more-specific `.target` than the IR actually
-    # uses, so the upgrade is safe; the reverse would not be.
-    llvm_sm = SMVersion(llvm_cap.major, llvm_cap.minor, cuda_sm.feature_set)
-    if !(llvm_sm in llvm_cap_support(LLVM.version()))
-        if cuda_sm.feature_set !== :baseline
-            @warn "LLVM $(LLVM.version()) does not support $(cpu_name(llvm_sm)) " *
-                  "(needed to compile for $(cpu_name(cuda_sm))); LLVM will emit baseline " *
-                  "code (inline PTX assembly is unaffected)."
+    ## LLVM
+    if ptxas_sm in llvm_support.cap
+        llvm_sm = ptxas_sm
+    else
+        # Exact `ptxas_sm` unavailable in LLVM. Fall back to baseline LLVM at a
+        # lower base, since arch/family features don't carry across versions.
+        baseline_candidates = filter(llvm_support.cap) do sm
+            sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
         end
-        llvm_sm = SMVersion(llvm_sm.major, llvm_sm.minor, :baseline)
+        isempty(baseline_candidates) &&
+            error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())")
+        llvm_sm = argmax(sm_key, baseline_candidates)
     end
 
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx,
-                                 feature_set=llvm_sm.feature_set, debuginfo, kwargs...)
-    params = CUDACompilerParams(; sm=cuda_sm, ptx=cuda_ptx)
+                                 feature_set=llvm_sm.feature_set,
+                                 debuginfo=true, kwargs...)
+    params = CUDACompilerParams(; sm=ptxas_sm, ptx=ptxas_ptx)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index 703d798159..a96849d72f 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -39,6 +39,23 @@ cpu_name(sm::SMVersion) = "sm_$(sm.major)$(sm.minor)$(suffix(sm))"
 # usable against the version-keyed compatibility databases.
 base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor)
 
+# Would a cubin compiled for `sm` actually load and run on a device with capability
+# `dev_cap`? Per NVIDIA's PTX ISA reference (.target directive):
+#   - baseline: forward-compatible (onion model) -- any sm_X runs on sm_Y for Y >= X.
+#   - family:   same architecture family (currently == same major) and forward-portable
+#               within the family.
+#   - arch:     locked to one exact CC; cubin only loads on devices with that exact cap.
+function runs_on(sm::SMVersion, dev_cap::VersionNumber)
+    if sm.feature_set === :arch
+        return base_version(sm) == dev_cap
+    elseif sm.feature_set === :family
+        return sm.major == dev_cap.major && base_version(sm) <= dev_cap
+    else  # :baseline
+        return base_version(sm) <= dev_cap
+    end
+end
+
+
 Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"")
 
 function _parse_sm(s::AbstractString)
diff --git a/CUDACore/src/device/intrinsics/version.jl b/CUDACore/src/device/intrinsics/version.jl
index 6114833eeb..ea66448bda 100644
--- a/CUDACore/src/device/intrinsics/version.jl
+++ b/CUDACore/src/device/intrinsics/version.jl
@@ -1,8 +1,8 @@
 # device intrinsics for querying the compute SimpleVersion and PTX ISA version
 
-export compute_capability, ptx_isa_version
+export compute_capability, ptx_isa_version, target_feature_set
 
-for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"]
+for var in ["sm_major", "sm_minor", "sm_features", "ptx_major", "ptx_minor"]
     @eval @device_function @inline $(Symbol(var))() =
         Base.llvmcall(
             $("""@$var = external global i32
@@ -17,3 +17,16 @@ end
 @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor())
 @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor())
 
+# Feature set encoded in the `.target` directive: one of `:baseline`, `:family`, `:arch`.
+# (NVIDIA's PTX ISA reference: ".target specifies the set of features in the target
+# architecture for which the current PTX code was generated.") GPUCompiler stamps the
+# encoding in via the `sm_features` LLVM global, using `GPUCompiler.TargetFeatureSet`;
+# the integer load + chained compare folds away after LLVM inlines the constant, so
+# user code like `if target_feature_set() === :arch ... end` resolves to a single
+# branch in the PTX output.
+@device_function @inline function target_feature_set()
+    f = sm_features()
+    return f == UInt32(GPUCompiler.ArchFeatures)   ? :arch :
+           f == UInt32(GPUCompiler.FamilyFeatures) ? :family : :baseline
+end
+
diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl
index 4457d37330..93e0cd90c5 100644
--- a/CUDACore/src/device/runtime.jl
+++ b/CUDACore/src/device/runtime.jl
@@ -12,14 +12,16 @@ function precompile_runtime()
     f = ()->return
     mi = methodinstance(typeof(f), Tuple{})
 
-    caps = llvm_compat().cap
+    # `.cap` is now keyed by `SMVersion` and includes variants; runtime caches are
+    # feature_set-agnostic, so we only warm the baseline entries.
+    sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().cap)
     ptx = maximum(llvm_compat().ptx)
     JuliaContext() do ctx
-        for cap in caps, debuginfo in [false, true]
+        for sm in sms, debuginfo in [false, true]
             # NOTE: this often runs when we don't have a functioning set-up,
             #       so we don't use `compiler_config` which requires NVML
-            target = PTXCompilerTarget(; cap, ptx, debuginfo)
-            params = CUDACompilerParams(; sm=SMVersion(cap.major, cap.minor), ptx)
+            target = PTXCompilerTarget(; cap=base_version(sm), ptx, debuginfo)
+            params = CUDACompilerParams(; sm, ptx)
             config = CompilerConfig(target, params)
             job = CompilerJob(mi, config)
             GPUCompiler.load_runtime(job)
diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl
index e145f63467..a5c39b1cd8 100644
--- a/CUDACore/src/precompile.jl
+++ b/CUDACore/src/precompile.jl
@@ -13,11 +13,16 @@ if :NVPTX in LLVM.backends()
             end
 
             llvm_support = llvm_compat()
-            llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
+            # `.cap` is keyed by `SMVersion` and includes variants; pick the highest
+            # baseline cap <= v"7.5" for a portable precompile artifact.
+            llvm_sm = argmax(base_version,
+                             filter(sm -> sm.feature_set === :baseline &&
+                                          base_version(sm) <= v"7.5",
+                                    llvm_support.cap))
             llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))
 
-            target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
-            params = CUDACompilerParams(; sm=SMVersion(llvm_cap.major, llvm_cap.minor), ptx=llvm_ptx)
+            target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true)
+            params = CUDACompilerParams(; sm=llvm_sm, ptx=llvm_ptx)
             config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)
 
             tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
diff --git a/test/core/execution.jl b/test/core/execution.jl
index f54da84bef..ddb871b936 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -61,20 +61,30 @@ end
     asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0"))
     @test contains(asm, ".target sm_50")
 
+    # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the
+    # `.version` line should match what was asked for, independently of what LLVM and
+    # ptxas would natively pick.
     asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3"))
     @test contains(asm, ".version 6.3")
 
+    # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it
+    # used to be); a clearly out-of-range value must error at config time.
+    @test_throws "not supported" @cuda launch=false ptx=v"99.0" dummy()
+
     # feature_set is selected by the suffix on the sm"..." string; the suffix should
-    # surface in the .target directive in the PTX output.
+    # surface in the .target directive in the PTX output. The cuda-side `.target` is
+    # the variant regardless of LLVM support -- the mcgen rewrite stamps it in even
+    # when LLVM clamped to baseline for codegen.
+    sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch)
+    sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
+
     if dev_cap >= v"9.0"
-        sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch)
         asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a))
         @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
         # arch-specific cubin should also actually launch on the matching device
         @cuda cap=sm_a dummy()
     end
     if dev_cap >= v"10.0"
-        sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
         asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f))
         @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
         @cuda cap=sm_f dummy()
@@ -84,6 +94,36 @@ end
     # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on
     # newer CUDA toolkits no longer accepts sm_50.)
     @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0"))
+
+    # With no explicit `cap=`, we default to architecture-specific code paths on CC >=9.0
+    # since we know the exact device. The cuda-side `.target` is the variant regardless of
+    # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs.
+    if dev_cap >= v"9.0"
+        asm = sprint(io->CUDA.code_ptx(io, dummy, ()))
+        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+    end
+
+    # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built
+    # for (not the cuda-side .target): when LLVM doesn't natively support the exact variant,
+    # we fall back to baseline LLVM, so the global reflects baseline. The if-chain folds at
+    # codegen time, so the launched kernel writes a single constant.
+    function read_feature_set!(out)
+        @inbounds out[1] = if target_feature_set() === :arch
+            UInt32(2)
+        elseif target_feature_set() === :family
+            UInt32(1)
+        else
+            UInt32(0)
+        end
+        return
+    end
+    out = CuArray{UInt32}([typemax(UInt32)])
+    @cuda threads=1 read_feature_set!(out)
+    # arch features come through `target_feature_set()` only when LLVM natively supported
+    # the variant; otherwise we fell back to baseline LLVM and the global reflects that.
+    arch_in_llvm = sm_a in CUDACore.llvm_cap_support(CUDACore.LLVM.version())
+    expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0)
+    @test Array(out)[1] == expected
 end
 
 

From 5e385829a24832dc81e69af09ba7302c046956ec Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 11:41:27 +0200
Subject: [PATCH 08/16] Report on detected compiler support.

---
 CUDATools/src/utilities.jl | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/CUDATools/src/utilities.jl b/CUDATools/src/utilities.jl
index 079ad71eb3..c14453cb7a 100644
--- a/CUDATools/src/utilities.jl
+++ b/CUDATools/src/utilities.jl
@@ -142,5 +142,27 @@ function versioninfo(io::IO=stdout)
             query_cuda()
         end
         println(io, "  $(i-1): $str (sm_$(cap.major)$(cap.minor), $(Base.format_bytes(mem.free)) / $(Base.format_bytes(mem.total)) available)")
+
+        # report the default compilation target we'd select for this device
+        config = try
+            CUDACore.compiler_config(dev)
+        catch
+            nothing
+        end
+        if config !== nothing
+            ptxas_sm  = config.params.sm
+            ptxas_ptx = config.params.ptx
+            llvm_sm   = CUDACore.SMVersion(config.target.cap.major,
+                                           config.target.cap.minor,
+                                           config.target.feature_set)
+            llvm_ptx  = config.target.ptx
+            ptxas_str = "$(CUDACore.cpu_name(ptxas_sm)) / PTX $(ptxas_ptx.major).$(ptxas_ptx.minor)"
+            if llvm_sm == ptxas_sm && llvm_ptx == ptxas_ptx
+                println(io, "     compiles to $ptxas_str")
+            else
+                llvm_str = "$(CUDACore.cpu_name(llvm_sm)) / PTX $(llvm_ptx.major).$(llvm_ptx.minor)"
+                println(io, "     compiles to $ptxas_str (LLVM: $llvm_str)")
+            end
+        end
     end
 end

From 2158f425620a92922d1cc7ee9584175d4da86f61 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 11:45:38 +0200
Subject: [PATCH 09/16] Bump GPUCompiler.

---
 CUDACore/Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index bab293aba3..2c9992781e 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.10"
+GPUCompiler = "1.12"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"

From fba88cd0a823f2dda98b570e426ecafc5a20b815 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 12:58:27 +0200
Subject: [PATCH 10/16] Improve test.

---
 CUDACore/src/compiler/sm.jl | 22 +++++++++++-----------
 test/core/codegen.jl        |  6 +++---
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index a96849d72f..c29f2783a3 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -28,6 +28,16 @@ struct SMVersion
     end
 end
 
+function Base.parse(::Type{SMVersion}, s::AbstractString)
+    m = match(r"^(\d+)\.(\d+)([af]?)$", s)
+    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"")
+    major = parse(Int, m.captures[1])
+    minor = parse(Int, m.captures[2])
+    fs = m.captures[3] == "a" ? :arch :
+         m.captures[3] == "f" ? :family : :baseline
+    return SMVersion(major, minor, fs)
+end
+
 # Suffix on the LLVM CPU name / `.target` directive
 suffix(sm::SMVersion) = sm.feature_set === :arch    ? "a" :
                         sm.feature_set === :family  ? "f" : ""
@@ -58,16 +68,6 @@ end
 
 Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"")
 
-function _parse_sm(s::AbstractString)
-    m = match(r"^(\d+)\.(\d+)([af]?)$", s)
-    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"")
-    major = parse(Int, m.captures[1])
-    minor = parse(Int, m.captures[2])
-    fs = m.captures[3] == "a" ? :arch :
-         m.captures[3] == "f" ? :family : :baseline
-    return SMVersion(major, minor, fs)
-end
-
 macro sm_str(s)
-    return _parse_sm(s)
+    return :(Base.parse($SMVersion, $(esc(s))))
 end
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index ffac61be81..d818741d2c 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -288,9 +288,9 @@ end
     # constructor rejects bogus feature_set
     @test_throws ErrorException SMVersion(9, 0, :bogus)
     # macro rejects malformed strings
-    @test_throws ErrorException CUDACore._parse_sm("103a")     # missing dot
-    @test_throws ErrorException CUDACore._parse_sm("10.3x")    # unknown suffix
-    @test_throws ErrorException CUDACore._parse_sm("10")       # missing minor
+    @test_throws ErrorException parse(SMVersion, "103a")     # missing dot
+    @test_throws ErrorException parse(SMVersion, "10.3x")    # unknown suffix
+    @test_throws ErrorException parse(SMVersion, "10")       # missing minor
 end
 
 @testset "CUDACompilerParams hash discriminates on feature_set" begin

From 8250be81d7c0f02f1f762744cdfcd1c01548c689 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 14:53:08 +0200
Subject: [PATCH 11/16] Add docs.

---
 CUDACore/src/compiler/sm.jl | 72 ++++++++++++++++++++++++++++---------
 docs/src/api/compiler.md    |  8 +++++
 2 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index c29f2783a3..eeebe68f73 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -1,21 +1,45 @@
-# SMVersion: a PTX compilation target, identifying a CUDA compute capability together
-# with its subtarget feature set.
-#
-# Constructed via the `sm"..."` string macro, which accepts the dotted form used in
-# CUDA's documentation: `sm"10.3a"` is compute capability 10.3 with architecture-
-# accelerated features. The bare form `sm"10.3"` is the default, forward-compatible
-# feature set (the "onion model").
-#
-# See `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature
-# definitions, and CUDA's PTX ISA documentation under `.target` for the runtime
-# compatibility implications:
-#
-#   :baseline (no suffix)   - forward-compatible (sm_X for any sm_Y >= X)
-#   :family   ('f' suffix)  - same-major-family-portable
-#   :arch     ('a' suffix)  - locked to one exact CC
-
 export SMVersion, @sm_str
 
+"""
+    SMVersion(major, minor, [feature_set])
+
+A PTX compilation target, identifying a CUDA compute capability together with the
+subtarget feature set selected by the suffix on its `.target` directive.
+
+`feature_set` is one of:
+
+- `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"):
+  PTX compiled for `sm_X` runs on any `sm_Y` with `Y >= X`.
+- `:family` (`f` suffix, e.g. `sm_100f`) — same-major-family-portable: PTX runs on
+  any device in the same architecture family (currently == same major version) at
+  or above this CC.
+- `:arch` (`a` suffix, e.g. `sm_90a`) — locked to one exact CC: PTX runs only on
+  devices with exactly this compute capability, but in exchange gets access to
+  architecture-accelerated features.
+
+See NVIDIA's PTX ISA reference under `.target` for the full compatibility rules,
+and `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature
+definitions.
+
+Public fields:
+- `sm.major::Int`
+- `sm.minor::Int`
+- `sm.feature_set::Symbol`
+
+See also [`@sm_str`](@ref) for an ergonomic string-macro constructor.
+
+# Examples
+```julia
+julia> SMVersion(9, 0)            # baseline
+sm"9.0"
+
+julia> SMVersion(9, 0, :arch)
+sm"9.0a"
+
+julia> sm"10.0f" == SMVersion(10, 0, :family)
+true
+```
+"""
 struct SMVersion
     major::Int
     minor::Int
@@ -68,6 +92,22 @@ end
 
 Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"")
 
+"""
+    @sm_str
+
+String macro used to parse a string to an [`SMVersion`](@ref). Accepts the dotted form
+used in NVIDIA's PTX ISA reference: `sm"9.0"` for baseline, `sm"9.0a"` for
+architecture-accelerated, `sm"10.0f"` for family-specific.
+
+# Examples
+```julia
+julia> sm"10.3a"
+sm"10.3a"
+
+julia> sm"10.0f" == SMVersion(10, 0, :family)
+true
+```
+"""
 macro sm_str(s)
     return :(Base.parse($SMVersion, $(esc(s))))
 end
diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md
index a7ce178a3e..a2c493d789 100644
--- a/docs/src/api/compiler.md
+++ b/docs/src/api/compiler.md
@@ -25,6 +25,14 @@ registers
 memory
 ```
 
+The PTX compilation target is identified by an `SMVersion`, constructed via the
+`sm"..."` string macro:
+
+```@docs
+SMVersion
+@sm_str
+```
+
 To plug in alternative compiler back-ends (e.g. cuTile.jl), `@cuda` dispatches
 through a small protocol:
 

From aacd3b432ef256765af599270b514c620403935e Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 14:59:06 +0200
Subject: [PATCH 12/16] Switch to a more NVIDIA-like SM string.

---
 CUDACore/src/compatibility.jl      | 150 ++++++++++++++---------------
 CUDACore/src/compiler/execution.jl |   4 +-
 CUDACore/src/compiler/sm.jl        |  34 ++++---
 test/core/codegen.jl               |  60 ++++++------
 test/core/execution.jl             |   6 +-
 5 files changed, 131 insertions(+), 123 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index 71a0dfc0f0..f8190d39cf 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -151,45 +151,45 @@ end
 # Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants
 # were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8.
 const ptx_cap_db = Dict{SMVersion, VersionRange}(
-    sm"1.0"   => between(v"1.0", highest),
-    sm"1.1"   => between(v"1.0", highest),
-    sm"1.2"   => between(v"1.2", highest),
-    sm"1.3"   => between(v"1.2", highest),
-    sm"2.0"   => between(v"2.0", highest),
-    sm"3.0"   => between(v"3.1", highest),
-    sm"3.2"   => between(v"4.0", highest),
-    sm"3.5"   => between(v"3.1", highest),
-    sm"3.7"   => between(v"4.1", highest),
-    sm"5.0"   => between(v"4.0", highest),
-    sm"5.2"   => between(v"4.1", highest),
-    sm"5.3"   => between(v"4.2", highest),
-    sm"6.0"   => between(v"5.0", highest),
-    sm"6.1"   => between(v"5.0", highest),
-    sm"6.2"   => between(v"5.0", highest),
-    sm"7.0"   => between(v"6.0", highest),
-    sm"7.2"   => between(v"6.1", highest),
-    sm"7.5"   => between(v"6.3", highest),
-    sm"8.0"   => between(v"7.0", highest),
-    sm"8.6"   => between(v"7.1", highest),
-    sm"8.7"   => between(v"7.4", highest),
-    sm"8.9"   => between(v"7.8", highest),
-    sm"9.0"   => between(v"7.8", highest),
-    sm"9.0a"  => between(v"8.0", highest),
-    sm"10.0"  => between(v"8.6", highest),
-    sm"10.0a" => between(v"8.6", highest),
-    sm"10.0f" => between(v"8.8", highest),
-    sm"10.1"  => between(v"8.6", highest),
-    sm"10.1a" => between(v"8.6", highest),
-    sm"10.1f" => between(v"8.8", highest),
-    sm"10.3"  => between(v"8.8", highest),
-    sm"10.3a" => between(v"8.8", highest),
-    sm"10.3f" => between(v"8.8", highest),
-    sm"12.0"  => between(v"8.7", highest),
-    sm"12.0a" => between(v"8.7", highest),
-    sm"12.0f" => between(v"8.8", highest),
-    sm"12.1"  => between(v"8.8", highest),
-    sm"12.1a" => between(v"8.8", highest),
-    sm"12.1f" => between(v"8.8", highest),
+    sm"10"   => between(v"1.0", highest),
+    sm"11"   => between(v"1.0", highest),
+    sm"12"   => between(v"1.2", highest),
+    sm"13"   => between(v"1.2", highest),
+    sm"20"   => between(v"2.0", highest),
+    sm"30"   => between(v"3.1", highest),
+    sm"32"   => between(v"4.0", highest),
+    sm"35"   => between(v"3.1", highest),
+    sm"37"   => between(v"4.1", highest),
+    sm"50"   => between(v"4.0", highest),
+    sm"52"   => between(v"4.1", highest),
+    sm"53"   => between(v"4.2", highest),
+    sm"60"   => between(v"5.0", highest),
+    sm"61"   => between(v"5.0", highest),
+    sm"62"   => between(v"5.0", highest),
+    sm"70"   => between(v"6.0", highest),
+    sm"72"   => between(v"6.1", highest),
+    sm"75"   => between(v"6.3", highest),
+    sm"80"   => between(v"7.0", highest),
+    sm"86"   => between(v"7.1", highest),
+    sm"87"   => between(v"7.4", highest),
+    sm"89"   => between(v"7.8", highest),
+    sm"90"   => between(v"7.8", highest),
+    sm"90a"  => between(v"8.0", highest),
+    sm"100"  => between(v"8.6", highest),
+    sm"100a" => between(v"8.6", highest),
+    sm"100f" => between(v"8.8", highest),
+    sm"101"  => between(v"8.6", highest),
+    sm"101a" => between(v"8.6", highest),
+    sm"101f" => between(v"8.8", highest),
+    sm"103"  => between(v"8.8", highest),
+    sm"103a" => between(v"8.8", highest),
+    sm"103f" => between(v"8.8", highest),
+    sm"120"  => between(v"8.7", highest),
+    sm"120a" => between(v"8.7", highest),
+    sm"120f" => between(v"8.8", highest),
+    sm"121"  => between(v"8.8", highest),
+    sm"121a" => between(v"8.8", highest),
+    sm"121f" => between(v"8.8", highest),
 )
 
 # Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`.
@@ -210,42 +210,42 @@ end
 # here as a separate entry; without an entry LLVM does not know the variant CPU name and
 # constructing a TargetMachine with it would fall back to a generic subtarget.
 const llvm_cap_db = Dict{SMVersion, VersionRange}(
-    sm"2.0"   => between(v"3.2", highest),
-    sm"2.1"   => between(v"3.2", highest),
-    sm"3.0"   => between(v"3.2", highest),
-    sm"3.2"   => between(v"3.7", highest),
-    sm"3.5"   => between(v"3.2", highest),
-    sm"3.7"   => between(v"3.7", highest),
-    sm"5.0"   => between(v"3.5", highest),
-    sm"5.2"   => between(v"3.7", highest),
-    sm"5.3"   => between(v"3.7", highest),
-    sm"6.0"   => between(v"3.9", highest),
-    sm"6.1"   => between(v"3.9", highest),
-    sm"6.2"   => between(v"3.9", highest),
-    sm"7.0"   => between(v"6", highest),
-    sm"7.2"   => between(v"7", highest),
-    sm"7.5"   => between(v"8", highest),
-    sm"8.0"   => between(v"11", highest),
-    sm"8.6"   => between(v"13", highest),
-    sm"8.7"   => between(v"16", highest),
-    sm"8.9"   => between(v"16", highest),
-    sm"9.0"   => between(v"16", highest),
-    sm"9.0a"  => between(v"18", highest),
-    sm"10.0"  => between(v"20", highest),
-    sm"10.0a" => between(v"20", highest),
-    sm"10.0f" => between(v"21", highest),
-    sm"10.1"  => between(v"20", highest),
-    sm"10.1a" => between(v"20", highest),
-    sm"10.1f" => between(v"21", highest),
-    sm"10.3"  => between(v"21", highest),
-    sm"10.3a" => between(v"21", highest),
-    sm"10.3f" => between(v"21", highest),
-    sm"12.0"  => between(v"20", highest),
-    sm"12.0a" => between(v"20", highest),
-    sm"12.0f" => between(v"21", highest),
-    sm"12.1"  => between(v"21", highest),
-    sm"12.1a" => between(v"21", highest),
-    sm"12.1f" => between(v"21", highest),
+    sm"20"   => between(v"3.2", highest),
+    sm"21"   => between(v"3.2", highest),
+    sm"30"   => between(v"3.2", highest),
+    sm"32"   => between(v"3.7", highest),
+    sm"35"   => between(v"3.2", highest),
+    sm"37"   => between(v"3.7", highest),
+    sm"50"   => between(v"3.5", highest),
+    sm"52"   => between(v"3.7", highest),
+    sm"53"   => between(v"3.7", highest),
+    sm"60"   => between(v"3.9", highest),
+    sm"61"   => between(v"3.9", highest),
+    sm"62"   => between(v"3.9", highest),
+    sm"70"   => between(v"6", highest),
+    sm"72"   => between(v"7", highest),
+    sm"75"   => between(v"8", highest),
+    sm"80"   => between(v"11", highest),
+    sm"86"   => between(v"13", highest),
+    sm"87"   => between(v"16", highest),
+    sm"89"   => between(v"16", highest),
+    sm"90"   => between(v"16", highest),
+    sm"90a"  => between(v"18", highest),
+    sm"100"  => between(v"20", highest),
+    sm"100a" => between(v"20", highest),
+    sm"100f" => between(v"21", highest),
+    sm"101"  => between(v"20", highest),
+    sm"101a" => between(v"20", highest),
+    sm"101f" => between(v"21", highest),
+    sm"103"  => between(v"21", highest),
+    sm"103a" => between(v"21", highest),
+    sm"103f" => between(v"21", highest),
+    sm"120"  => between(v"20", highest),
+    sm"120a" => between(v"20", highest),
+    sm"120f" => between(v"21", highest),
+    sm"121"  => between(v"21", highest),
+    sm"121a" => between(v"21", highest),
+    sm"121f" => between(v"21", highest),
 )
 
 # Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`.
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index a67a3f61ed..f42de56402 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -435,8 +435,8 @@ The following keyword arguments are supported:
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for.
   `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g.
-  `cap=sm"10.3a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"10.0f"`
-  for family-portable Blackwell codegen. The bare form `cap=sm"10.3"` selects baseline
+  `cap=sm"103a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"100f"`
+  for family-portable Blackwell codegen. The bare form `cap=sm"103"` selects baseline
   (forward-compatible) codegen. Passing a `VersionNumber` is deprecated.
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index eeebe68f73..1b3cdb9734 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -4,7 +4,11 @@ export SMVersion, @sm_str
     SMVersion(major, minor, [feature_set])
 
 A PTX compilation target, identifying a CUDA compute capability together with the
-subtarget feature set selected by the suffix on its `.target` directive.
+subtarget feature set selected by the suffix on its `.target` directive. Printed and
+parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"`
+for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]`
+notation in the PTX ISA reference and to distinguish visually from a device-level
+[`VersionNumber`](@ref) like `v"9.0"`.
 
 `feature_set` is one of:
 
@@ -31,12 +35,12 @@ See also [`@sm_str`](@ref) for an ergonomic string-macro constructor.
 # Examples
 ```julia
 julia> SMVersion(9, 0)            # baseline
-sm"9.0"
+sm"90"
 
 julia> SMVersion(9, 0, :arch)
-sm"9.0a"
+sm"90a"
 
-julia> sm"10.0f" == SMVersion(10, 0, :family)
+julia> sm"100f" == SMVersion(10, 0, :family)
 true
 ```
 """
@@ -53,8 +57,11 @@ struct SMVersion
 end
 
 function Base.parse(::Type{SMVersion}, s::AbstractString)
-    m = match(r"^(\d+)\.(\d+)([af]?)$", s)
-    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"")
+    # Mirrors NVIDIA's `sm_NN[a|f]` notation: the last digit before the optional suffix
+    # is the minor, everything before it is the major. Always one minor digit (NVIDIA
+    # has never minted a CC with minor >= 10, and rolls the major over instead).
+    m = match(r"^(\d+)(\d)([af]?)$", s)
+    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"103a\", or \"100f\"")
     major = parse(Int, m.captures[1])
     minor = parse(Int, m.captures[2])
     fs = m.captures[3] == "a" ? :arch :
@@ -90,21 +97,22 @@ function runs_on(sm::SMVersion, dev_cap::VersionNumber)
 end
 
 
-Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"")
+Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix(sm), "\"")
 
 """
     @sm_str
 
-String macro used to parse a string to an [`SMVersion`](@ref). Accepts the dotted form
-used in NVIDIA's PTX ISA reference: `sm"9.0"` for baseline, `sm"9.0a"` for
-architecture-accelerated, `sm"10.0f"` for family-specific.
+String macro used to parse a string to an [`SMVersion`](@ref). Accepts NVIDIA's
+compact `sm_NN[a|f]` notation (without the `sm_` prefix): `sm"90"` for baseline,
+`sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. The last
+digit before the optional suffix is the minor; everything before it is the major.
 
 # Examples
 ```julia
-julia> sm"10.3a"
-sm"10.3a"
+julia> sm"103a"
+sm"103a"
 
-julia> sm"10.0f" == SMVersion(10, 0, :family)
+julia> sm"100f" == SMVersion(10, 0, :family)
 true
 ```
 """
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index d818741d2c..de07b39ce9 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -255,36 +255,36 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0")
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90")
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
 
     # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
     # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0a")
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90a")
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
     # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
-    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"10.0f")
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"100f")
     @test occursin(".target sm_100f", asm_family)
     @test success(run_ptxas(asm_family, "sm_100f"))
 end
 
 @testset "SMVersion and sm\"...\" macro" begin
-    @test sm"9.0"   == SMVersion(9, 0, :baseline)
-    @test sm"9.0a"  == SMVersion(9, 0, :arch)
-    @test sm"10.0f" == SMVersion(10, 0, :family)
+    @test sm"90"   == SMVersion(9, 0, :baseline)
+    @test sm"90a"  == SMVersion(9, 0, :arch)
+    @test sm"100f" == SMVersion(10, 0, :family)
     # printing roundtrips via the macro form
-    @test sprint(show, sm"10.3a") == "sm\"10.3a\""
-    @test sprint(show, sm"10.0")  == "sm\"10.0\""
+    @test sprint(show, sm"103a") == "sm\"103a\""
+    @test sprint(show, sm"100")  == "sm\"100\""
     # cpu_name reflects feature_set
-    @test CUDACore.cpu_name(sm"9.0")   == "sm_90"
-    @test CUDACore.cpu_name(sm"9.0a")  == "sm_90a"
-    @test CUDACore.cpu_name(sm"10.0f") == "sm_100f"
+    @test CUDACore.cpu_name(sm"90")   == "sm_90"
+    @test CUDACore.cpu_name(sm"90a")  == "sm_90a"
+    @test CUDACore.cpu_name(sm"100f") == "sm_100f"
     # base_version drops the suffix back to a comparable VersionNumber
-    @test CUDACore.base_version(sm"10.3a") == v"10.3"
+    @test CUDACore.base_version(sm"103a") == v"10.3"
     # constructor rejects bogus feature_set
     @test_throws ErrorException SMVersion(9, 0, :bogus)
     # macro rejects malformed strings
@@ -296,33 +296,33 @@ end
 @testset "CUDACompilerParams hash discriminates on feature_set" begin
     # Without feature_set in the hash, two params differing only on feature_set would collide
     # in the compiler cache and silently return a cubin compiled for the wrong feature set.
-    base = CUDACore.CUDACompilerParams(sm=sm"9.0",  ptx=v"8.0")
-    arch = CUDACore.CUDACompilerParams(sm=sm"9.0a", ptx=v"8.0")
+    base = CUDACore.CUDACompilerParams(sm=sm"90",  ptx=v"8.0")
+    arch = CUDACore.CUDACompilerParams(sm=sm"90a", ptx=v"8.0")
     @test hash(base) != hash(arch)
     @test base != arch
 end
 
 @testset "ptx_cap_support" begin
-    # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"9.0") and PTX >= 8.0.
-    # Family-specific needs CC >= 10.0 (no `*f` keys below sm"10.0") and PTX >= 8.8.
-    @test !(sm"8.6a" in CUDACore.ptx_cap_support(v"8.0"))   # no `*a` below CC 9.0
-    @test !(sm"9.0a" in CUDACore.ptx_cap_support(v"7.8"))   # `*a` requires PTX >= 8.0
-    @test !(sm"9.0f" in CUDACore.ptx_cap_support(v"8.0"))   # no `*f` below CC 10.0
-    @test !(sm"10.0f" in CUDACore.ptx_cap_support(v"8.7"))  # `*f` requires PTX >= 8.8
-    @test  sm"9.0a"  in CUDACore.ptx_cap_support(v"8.0")
-    @test  sm"10.0f" in CUDACore.ptx_cap_support(v"8.8")
-    @test  sm"5.0"   in CUDACore.ptx_cap_support(v"6.2")
+    # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0.
+    # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8.
+    @test !(sm"86a" in CUDACore.ptx_cap_support(v"8.0"))   # no `*a` below CC 9.0
+    @test !(sm"90a" in CUDACore.ptx_cap_support(v"7.8"))   # `*a` requires PTX >= 8.0
+    @test !(sm"90f" in CUDACore.ptx_cap_support(v"8.0"))   # no `*f` below CC 10.0
+    @test !(sm"100f" in CUDACore.ptx_cap_support(v"8.7"))  # `*f` requires PTX >= 8.8
+    @test  sm"90a"  in CUDACore.ptx_cap_support(v"8.0")
+    @test  sm"100f" in CUDACore.ptx_cap_support(v"8.8")
+    @test  sm"50"   in CUDACore.ptx_cap_support(v"6.2")
 end
 
 @testset "llvm_cap_support" begin
     # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td.
-    @test  sm"10.3a" in CUDACore.llvm_cap_support(v"21")
-    @test  sm"10.0f" in CUDACore.llvm_cap_support(v"21")
-    @test  sm"9.0a"  in CUDACore.llvm_cap_support(v"18")
-    @test !(sm"9.0a"  in CUDACore.llvm_cap_support(v"17"))  # sm_90a added in LLVM 18
-    @test !(sm"10.3a" in CUDACore.llvm_cap_support(v"20"))  # sm_103a added in LLVM 21
-    @test !(sm"10.0f" in CUDACore.llvm_cap_support(v"20"))  # sm_100f added in LLVM 21
-    @test  sm"7.0"   in CUDACore.llvm_cap_support(v"15")
+    @test  sm"103a" in CUDACore.llvm_cap_support(v"21")
+    @test  sm"100f" in CUDACore.llvm_cap_support(v"21")
+    @test  sm"90a"  in CUDACore.llvm_cap_support(v"18")
+    @test !(sm"90a"  in CUDACore.llvm_cap_support(v"17"))  # sm_90a added in LLVM 18
+    @test !(sm"103a" in CUDACore.llvm_cap_support(v"20"))  # sm_103a added in LLVM 21
+    @test !(sm"100f" in CUDACore.llvm_cap_support(v"20"))  # sm_100f added in LLVM 21
+    @test  sm"70"   in CUDACore.llvm_cap_support(v"15")
 end
 
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index ddb871b936..a3e5530aa9 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -50,15 +50,15 @@ end
     @cuda threads=2 dummy()
 
     # sm_10 isn't supported by LLVM
-    @test_throws "not supported by LLVM" @cuda launch=false cap=sm"1.0" dummy()
+    @test_throws "not supported by LLVM" @cuda launch=false cap=sm"10" dummy()
     # sm_20 is, but not by any CUDA version we support
-    @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"2.0" dummy()
+    @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"20" dummy()
     # there isn't any capability other than the device's that's guaruanteed to work
     dev_cap = capability(device())
     dev_sm = SMVersion(dev_cap.major, dev_cap.minor)
     @cuda launch=false cap=dev_sm dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0"))
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50"))
     @test contains(asm, ".target sm_50")
 
     # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the

From 17bd4c8b2a72ac4ecab94be0ae2f62503867026d Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 16:04:00 +0200
Subject: [PATCH 13/16] More cleanly separate device capability and codegen
 architecture.

---
 CUDACore/src/compatibility.jl        | 19 ++++++++------
 CUDACore/src/compiler/compilation.jl | 34 ++++++++++++++----------
 CUDACore/src/compiler/execution.jl   | 13 +++++-----
 CUDACore/src/device/runtime.jl       |  2 +-
 CUDACore/src/precompile.jl           |  6 ++---
 test/core/codegen.jl                 | 39 ++++++++++++++--------------
 test/core/execution.jl               | 32 +++++++++++++----------
 7 files changed, 81 insertions(+), 64 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index f8190d39cf..0f512a0255 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -16,7 +16,7 @@ const highest = v"999"
 #     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
 #
 # Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
-# require for them, is encoded directly in the keys of `ptx_cap_db` and `llvm_cap_db`
+# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db`
 # below: an unsupported combination simply has no entry.
 
 
@@ -150,7 +150,7 @@ end
 
 # Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants
 # were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8.
-const ptx_cap_db = Dict{SMVersion, VersionRange}(
+const ptx_sm_db = Dict{SMVersion, VersionRange}(
     sm"10"   => between(v"1.0", highest),
     sm"11"   => between(v"1.0", highest),
     sm"12"   => between(v"1.2", highest),
@@ -193,9 +193,9 @@ const ptx_cap_db = Dict{SMVersion, VersionRange}(
 )
 
 # Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`.
-function ptx_cap_support(ver::VersionNumber)
+function ptx_sm_support(ver::VersionNumber)
     caps = Set{SMVersion}()
-    for (cap, r) in ptx_cap_db
+    for (cap, r) in ptx_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -209,7 +209,7 @@ end
 # Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up
 # here as a separate entry; without an entry LLVM does not know the variant CPU name and
 # constructing a TargetMachine with it would fall back to a generic subtarget.
-const llvm_cap_db = Dict{SMVersion, VersionRange}(
+const llvm_sm_db = Dict{SMVersion, VersionRange}(
     sm"20"   => between(v"3.2", highest),
     sm"21"   => between(v"3.2", highest),
     sm"30"   => between(v"3.2", highest),
@@ -249,9 +249,9 @@ const llvm_cap_db = Dict{SMVersion, VersionRange}(
 )
 
 # Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`.
-function llvm_cap_support(ver::VersionNumber)
+function llvm_sm_support(ver::VersionNumber)
     caps = Set{SMVersion}()
-    for (cap, r) in llvm_cap_db
+    for (cap, r) in llvm_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -315,7 +315,10 @@ end
 function llvm_compat(version=LLVM.version())
     LLVM.InitializeNVPTXTarget()
 
-    return (cap=llvm_cap_support(version),
+    # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`.
+    # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level
+    # support is per-CC -- the names track the value type.
+    return (sm=llvm_sm_support(version),
             ptx=llvm_ptx_support(version))
 end
 
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 32ff9f3375..dc296a1ca1 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -186,13 +186,21 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
-    # convert / deprecate VersionNumber cap
-    if cap isa VersionNumber
-        Base.depwarn("Passing a VersionNumber to `cap=` is deprecated; use the `sm\"$(cap.major).$(cap.minor)\"` string macro instead.",
+                                         arch=nothing, cap=nothing, ptx=nothing, kwargs...)
+    # `cap=` is the deprecated old name for `arch=` (matches nvcc/ptxas `-arch`).
+    if cap !== nothing
+        arch === nothing ||
+            throw(ArgumentError("pass either `arch=` or the deprecated `cap=`, not both"))
+        Base.depwarn("the `cap=` kwarg is deprecated; use `arch=` (matching nvcc/ptxas `-arch`) instead.",
                      :cufunction)
-        cap = SMVersion(cap.major, cap.minor, :baseline)
+        arch = cap
+    end
+    # `arch=` accepts a plain `VersionNumber` (treated as baseline) or an `SMVersion`.
+    if arch isa VersionNumber
+        arch = SMVersion(arch.major, arch.minor, :baseline)
     end
+    arch === nothing || arch isa SMVersion ||
+        throw(ArgumentError("`arch=` must be a VersionNumber, SMVersion, or nothing; got $(typeof(arch))"))
 
     # inspect the toolchain
     llvm_support = llvm_compat()
@@ -226,29 +234,29 @@ end
 
     # determine the compute capability to use.
     ## ptxas
-    ptx_caps = ptx_cap_support(ptxas_ptx)
-    if cap !== nothing
+    ptx_sms = ptx_sm_support(ptxas_ptx)
+    if arch !== nothing
         # explicit request: take it as-is, validating against the PTX ISA
-        cap in ptx_caps ||
-            error("$(cpu_name(cap)) is not supported by PTX ISA $(ptxas_ptx)")
-        ptxas_sm = cap
+        arch in ptx_sms ||
+            error("$(cpu_name(arch)) is not supported by PTX ISA $(ptxas_ptx)")
+        ptxas_sm = arch
     else
         # pick the most specific capability the selected PTX ISA supports whose cubin
         # would actually load on the current device. For baseline that's the onion model;
         # `:arch` requires an exact CC match, `:family` a same-family match.
-        ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_caps)
+        ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_sms)
         isempty(ptxas_candidates) &&
             error("Compute capability $(capability(dev)) is not supported by ptxas " *
                   "$(compiler_version()) at PTX ISA $(ptxas_ptx)")
         ptxas_sm = argmax(sm_key, ptxas_candidates)
     end
     ## LLVM
-    if ptxas_sm in llvm_support.cap
+    if ptxas_sm in llvm_support.sm
         llvm_sm = ptxas_sm
     else
         # Exact `ptxas_sm` unavailable in LLVM. Fall back to baseline LLVM at a
         # lower base, since arch/family features don't carry across versions.
-        baseline_candidates = filter(llvm_support.cap) do sm
+        baseline_candidates = filter(llvm_support.sm) do sm
             sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm)
         end
         isempty(baseline_candidates) &&
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index f42de56402..bfb7d963ce 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :arch, :cap, :ptx]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -433,11 +433,12 @@ The following keyword arguments are supported:
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
-- `cap` and `ptx`: to override the compute capability and PTX version to compile for.
-  `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g.
-  `cap=sm"103a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"100f"`
-  for family-portable Blackwell codegen. The bare form `cap=sm"103"` selects baseline
-  (forward-compatible) codegen. Passing a `VersionNumber` is deprecated.
+- `arch` and `ptx`: override the GPU architecture (matching nvcc/ptxas `-arch`) and the
+  PTX ISA version to compile for. `arch` accepts either an [`SMVersion`](@ref) via the
+  `sm"..."` string macro (e.g. `arch=sm"103a"` for architecture-accelerated codegen on
+  CC 10.3, or `arch=sm"100f"` for family-portable Blackwell codegen) or a `VersionNumber`
+  (e.g. `arch=v"10.3"`, treated as baseline / forward-compatible). The old kwarg name
+  `cap=` is accepted as a deprecated alias.
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl
index 93e0cd90c5..6739d27d68 100644
--- a/CUDACore/src/device/runtime.jl
+++ b/CUDACore/src/device/runtime.jl
@@ -14,7 +14,7 @@ function precompile_runtime()
 
     # `.cap` is now keyed by `SMVersion` and includes variants; runtime caches are
     # feature_set-agnostic, so we only warm the baseline entries.
-    sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().cap)
+    sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().sm)
     ptx = maximum(llvm_compat().ptx)
     JuliaContext() do ctx
         for sm in sms, debuginfo in [false, true]
diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl
index a5c39b1cd8..8817aa74cc 100644
--- a/CUDACore/src/precompile.jl
+++ b/CUDACore/src/precompile.jl
@@ -13,12 +13,12 @@ if :NVPTX in LLVM.backends()
             end
 
             llvm_support = llvm_compat()
-            # `.cap` is keyed by `SMVersion` and includes variants; pick the highest
-            # baseline cap <= v"7.5" for a portable precompile artifact.
+            # `.sm` is `Set{SMVersion}` (with variants); pick the highest baseline
+            # entry <= v"7.5" for a portable precompile artifact.
             llvm_sm = argmax(base_version,
                              filter(sm -> sm.feature_set === :baseline &&
                                           base_version(sm) <= v"7.5",
-                                    llvm_support.cap))
+                                    llvm_support.sm))
             llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))
 
             target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true)
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index de07b39ce9..30e472cd08 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -288,9 +288,10 @@ end
     # constructor rejects bogus feature_set
     @test_throws ErrorException SMVersion(9, 0, :bogus)
     # macro rejects malformed strings
-    @test_throws ErrorException parse(SMVersion, "103a")     # missing dot
-    @test_throws ErrorException parse(SMVersion, "10.3x")    # unknown suffix
-    @test_throws ErrorException parse(SMVersion, "10")       # missing minor
+    @test_throws ErrorException parse(SMVersion, "10.3a")    # dotted form (NVIDIA uses dotless)
+    @test_throws ErrorException parse(SMVersion, "100x")     # unknown suffix
+    @test_throws ErrorException parse(SMVersion, "1")        # only one digit (need at least major + minor)
+    @test_throws ErrorException parse(SMVersion, "")         # empty
 end
 
 @testset "CUDACompilerParams hash discriminates on feature_set" begin
@@ -302,27 +303,27 @@ end
     @test base != arch
 end
 
-@testset "ptx_cap_support" begin
+@testset "ptx_sm_support" begin
     # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0.
     # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8.
-    @test !(sm"86a" in CUDACore.ptx_cap_support(v"8.0"))   # no `*a` below CC 9.0
-    @test !(sm"90a" in CUDACore.ptx_cap_support(v"7.8"))   # `*a` requires PTX >= 8.0
-    @test !(sm"90f" in CUDACore.ptx_cap_support(v"8.0"))   # no `*f` below CC 10.0
-    @test !(sm"100f" in CUDACore.ptx_cap_support(v"8.7"))  # `*f` requires PTX >= 8.8
-    @test  sm"90a"  in CUDACore.ptx_cap_support(v"8.0")
-    @test  sm"100f" in CUDACore.ptx_cap_support(v"8.8")
-    @test  sm"50"   in CUDACore.ptx_cap_support(v"6.2")
+    @test !(sm"86a" in CUDACore.ptx_sm_support(v"8.0"))   # no `*a` below CC 9.0
+    @test !(sm"90a" in CUDACore.ptx_sm_support(v"7.8"))   # `*a` requires PTX >= 8.0
+    @test !(sm"90f" in CUDACore.ptx_sm_support(v"8.0"))   # no `*f` below CC 10.0
+    @test !(sm"100f" in CUDACore.ptx_sm_support(v"8.7"))  # `*f` requires PTX >= 8.8
+    @test  sm"90a"  in CUDACore.ptx_sm_support(v"8.0")
+    @test  sm"100f" in CUDACore.ptx_sm_support(v"8.8")
+    @test  sm"50"   in CUDACore.ptx_sm_support(v"6.2")
 end
 
-@testset "llvm_cap_support" begin
+@testset "llvm_sm_support" begin
     # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td.
-    @test  sm"103a" in CUDACore.llvm_cap_support(v"21")
-    @test  sm"100f" in CUDACore.llvm_cap_support(v"21")
-    @test  sm"90a"  in CUDACore.llvm_cap_support(v"18")
-    @test !(sm"90a"  in CUDACore.llvm_cap_support(v"17"))  # sm_90a added in LLVM 18
-    @test !(sm"103a" in CUDACore.llvm_cap_support(v"20"))  # sm_103a added in LLVM 21
-    @test !(sm"100f" in CUDACore.llvm_cap_support(v"20"))  # sm_100f added in LLVM 21
-    @test  sm"70"   in CUDACore.llvm_cap_support(v"15")
+    @test  sm"103a" in CUDACore.llvm_sm_support(v"21")
+    @test  sm"100f" in CUDACore.llvm_sm_support(v"21")
+    @test  sm"90a"  in CUDACore.llvm_sm_support(v"18")
+    @test !(sm"90a"  in CUDACore.llvm_sm_support(v"17"))  # sm_90a added in LLVM 18
+    @test !(sm"103a" in CUDACore.llvm_sm_support(v"20"))  # sm_103a added in LLVM 21
+    @test !(sm"100f" in CUDACore.llvm_sm_support(v"20"))  # sm_100f added in LLVM 21
+    @test  sm"70"   in CUDACore.llvm_sm_support(v"15")
 end
 
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index a3e5530aa9..387af3547e 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -50,15 +50,20 @@ end
     @cuda threads=2 dummy()
 
     # sm_10 isn't supported by LLVM
-    @test_throws "not supported by LLVM" @cuda launch=false cap=sm"10" dummy()
+    @test_throws "not supported by LLVM" @cuda launch=false arch=sm"10" dummy()
     # sm_20 is, but not by any CUDA version we support
-    @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"20" dummy()
+    @test_throws "Failed to compile PTX code" @cuda launch=false arch=sm"20" dummy()
     # there isn't any capability other than the device's that's guaruanteed to work
     dev_cap = capability(device())
     dev_sm = SMVersion(dev_cap.major, dev_cap.minor)
-    @cuda launch=false cap=dev_sm dummy()
+    @cuda launch=false arch=dev_sm dummy()
+    # `arch=` also accepts a plain `VersionNumber` -- treated as baseline. Equivalent
+    # to constructing the SMVersion directly.
+    @cuda launch=false arch=dev_cap dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50"))
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50"))
+    @test contains(asm, ".target sm_50")
+    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0"))
     @test contains(asm, ".target sm_50")
 
     # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the
@@ -79,23 +84,22 @@ end
     sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
 
     if dev_cap >= v"9.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a))
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a))
         @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
         # arch-specific cubin should also actually launch on the matching device
-        @cuda cap=sm_a dummy()
+        @cuda arch=sm_a dummy()
     end
     if dev_cap >= v"10.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f))
+        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f))
         @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
-        @cuda cap=sm_f dummy()
+        @cuda arch=sm_f dummy()
     end
 
-    # passing a VersionNumber to `cap` is deprecated; check the depwarn fires while
-    # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on
-    # newer CUDA toolkits no longer accepts sm_50.)
-    @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0"))
+    # `cap=` is the deprecated alias for `arch=`; check the depwarn fires while
+    # the path still produces the right PTX.
+    @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50"))
 
-    # With no explicit `cap=`, we default to architecture-specific code paths on CC >=9.0
+    # With no explicit `arch=`, we default to architecture-specific code paths on CC >=9.0
     # since we know the exact device. The cuda-side `.target` is the variant regardless of
     # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs.
     if dev_cap >= v"9.0"
@@ -121,7 +125,7 @@ end
     @cuda threads=1 read_feature_set!(out)
     # arch features come through `target_feature_set()` only when LLVM natively supported
     # the variant; otherwise we fell back to baseline LLVM and the global reflects that.
-    arch_in_llvm = sm_a in CUDACore.llvm_cap_support(CUDACore.LLVM.version())
+    arch_in_llvm = sm_a in CUDACore.llvm_sm_support(CUDACore.LLVM.version())
     expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0)
     @test Array(out)[1] == expected
 end

From aa9ae2e98987612898a3704ab10bce604ae4ef28 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 16:12:42 +0200
Subject: [PATCH 14/16] Also support string-based version numbers with sm_
 prefixes.

---
 CUDACore/src/compiler/compilation.jl |  9 ++----
 CUDACore/src/compiler/sm.jl          | 41 ++++++++++++++++++++++------
 test/core/codegen.jl                 |  8 ++++++
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index dc296a1ca1..bee91fbca1 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -195,12 +195,9 @@ end
                      :cufunction)
         arch = cap
     end
-    # `arch=` accepts a plain `VersionNumber` (treated as baseline) or an `SMVersion`.
-    if arch isa VersionNumber
-        arch = SMVersion(arch.major, arch.minor, :baseline)
-    end
-    arch === nothing || arch isa SMVersion ||
-        throw(ArgumentError("`arch=` must be a VersionNumber, SMVersion, or nothing; got $(typeof(arch))"))
+    # `SMVersion` is the universal normalizer: identity for an SMVersion, baseline-promotes
+    # a VersionNumber, parses a string. Anything else falls out as a MethodError naturally.
+    arch === nothing || (arch = SMVersion(arch))
 
     # inspect the toolchain
     llvm_support = llvm_compat()
diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index 1b3cdb9734..420e280eea 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -2,6 +2,9 @@ export SMVersion, @sm_str
 
 """
     SMVersion(major, minor, [feature_set])
+    SMVersion(s::AbstractString)
+    SMVersion(v::VersionNumber)
+    SMVersion(sm::SMVersion)
 
 A PTX compilation target, identifying a CUDA compute capability together with the
 subtarget feature set selected by the suffix on its `.target` directive. Printed and
@@ -10,6 +13,17 @@ for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]`
 notation in the PTX ISA reference and to distinguish visually from a device-level
 [`VersionNumber`](@ref) like `v"9.0"`.
 
+The single-argument constructors normalize various inputs to an `SMVersion`:
+
+- `SMVersion(::AbstractString)` parses the compact form, with or without the `sm_`
+  prefix (so e.g. `SMVersion("sm_103a")` and `SMVersion("103a")` both work).
+- `SMVersion(::VersionNumber)` promotes a plain compute-capability version to a
+  baseline `SMVersion` (`SMVersion(v"10.3") == SMVersion(10, 3, :baseline)`).
+- `SMVersion(::SMVersion)` is the identity (idempotent).
+
+This is what lets `@cuda arch=...` accept `v"10.3"`, `sm"103a"`, `"sm_103a"`, or
+an already-constructed `SMVersion` interchangeably.
+
 `feature_set` is one of:
 
 - `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"):
@@ -59,9 +73,12 @@ end
 function Base.parse(::Type{SMVersion}, s::AbstractString)
     # Mirrors NVIDIA's `sm_NN[a|f]` notation: the last digit before the optional suffix
     # is the minor, everything before it is the major. Always one minor digit (NVIDIA
-    # has never minted a CC with minor >= 10, and rolls the major over instead).
-    m = match(r"^(\d+)(\d)([af]?)$", s)
-    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"103a\", or \"100f\"")
+    # has never minted a CC with minor >= 10, and rolls the major over instead). The
+    # optional `sm_` prefix is accepted so PTX-tool output / config strings can pass
+    # straight through.
+    raw = startswith(s, "sm_") ? SubString(s, 4) : s
+    m = match(r"^(\d+)(\d)([af]?)$", raw)
+    m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"sm_103a\", or \"100f\"")
     major = parse(Int, m.captures[1])
     minor = parse(Int, m.captures[2])
     fs = m.captures[3] == "a" ? :arch :
@@ -69,6 +86,13 @@ function Base.parse(::Type{SMVersion}, s::AbstractString)
     return SMVersion(major, minor, fs)
 end
 
+# Single-argument constructor: the universal normalizer for accepting an `arch`/`cap`-like
+# argument. Identity for SMVersion; baseline-promotes a plain VersionNumber; parses a
+# string (with or without the `sm_` prefix).
+SMVersion(sm::SMVersion) = sm
+SMVersion(v::VersionNumber) = SMVersion(v.major, v.minor, :baseline)
+SMVersion(s::AbstractString) = Base.parse(SMVersion, s)
+
 # Suffix on the LLVM CPU name / `.target` directive
 suffix(sm::SMVersion) = sm.feature_set === :arch    ? "a" :
                         sm.feature_set === :family  ? "f" : ""
@@ -103,9 +127,10 @@ Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix(
     @sm_str
 
 String macro used to parse a string to an [`SMVersion`](@ref). Accepts NVIDIA's
-compact `sm_NN[a|f]` notation (without the `sm_` prefix): `sm"90"` for baseline,
-`sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. The last
-digit before the optional suffix is the minor; everything before it is the major.
+compact `sm_NN[a|f]` notation (with or without the `sm_` prefix): `sm"90"` for
+baseline, `sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific.
+Equivalent to calling `SMVersion(str)`; parses at macro-expansion time, so the
+resulting `SMVersion` is a compile-time constant in the surrounding expression.
 
 # Examples
 ```julia
@@ -116,6 +141,4 @@ julia> sm"100f" == SMVersion(10, 0, :family)
 true
 ```
 """
-macro sm_str(s)
-    return :(Base.parse($SMVersion, $(esc(s))))
-end
+macro sm_str(s); SMVersion(s); end
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 30e472cd08..c6acc1106f 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -292,6 +292,14 @@ end
     @test_throws ErrorException parse(SMVersion, "100x")     # unknown suffix
     @test_throws ErrorException parse(SMVersion, "1")        # only one digit (need at least major + minor)
     @test_throws ErrorException parse(SMVersion, "")         # empty
+
+    # `SMVersion(x)` as the universal normalizer:
+    @test SMVersion(sm"103a")          === sm"103a"                        # identity
+    @test SMVersion(v"10.3")           == SMVersion(10, 3, :baseline)      # VersionNumber → baseline
+    @test SMVersion("103a")            == sm"103a"                         # bare string
+    @test SMVersion("sm_103a")         == sm"103a"                         # accepts NVIDIA prefix
+    # the macro is just a parse-time call to the constructor
+    @test sm"103a"                     == SMVersion("103a")
 end
 
 @testset "CUDACompilerParams hash discriminates on feature_set" begin

From 1cedc6393db040edc3977643bc007d0a9e5cb98c Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 16:19:02 +0200
Subject: [PATCH 15/16] Drop overly specific tests.

---
 test/core/codegen.jl | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index c6acc1106f..db431db4ed 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -302,38 +302,6 @@ end
     @test sm"103a"                     == SMVersion("103a")
 end
 
-@testset "CUDACompilerParams hash discriminates on feature_set" begin
-    # Without feature_set in the hash, two params differing only on feature_set would collide
-    # in the compiler cache and silently return a cubin compiled for the wrong feature set.
-    base = CUDACore.CUDACompilerParams(sm=sm"90",  ptx=v"8.0")
-    arch = CUDACore.CUDACompilerParams(sm=sm"90a", ptx=v"8.0")
-    @test hash(base) != hash(arch)
-    @test base != arch
-end
-
-@testset "ptx_sm_support" begin
-    # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0.
-    # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8.
-    @test !(sm"86a" in CUDACore.ptx_sm_support(v"8.0"))   # no `*a` below CC 9.0
-    @test !(sm"90a" in CUDACore.ptx_sm_support(v"7.8"))   # `*a` requires PTX >= 8.0
-    @test !(sm"90f" in CUDACore.ptx_sm_support(v"8.0"))   # no `*f` below CC 10.0
-    @test !(sm"100f" in CUDACore.ptx_sm_support(v"8.7"))  # `*f` requires PTX >= 8.8
-    @test  sm"90a"  in CUDACore.ptx_sm_support(v"8.0")
-    @test  sm"100f" in CUDACore.ptx_sm_support(v"8.8")
-    @test  sm"50"   in CUDACore.ptx_sm_support(v"6.2")
-end
-
-@testset "llvm_sm_support" begin
-    # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td.
-    @test  sm"103a" in CUDACore.llvm_sm_support(v"21")
-    @test  sm"100f" in CUDACore.llvm_sm_support(v"21")
-    @test  sm"90a"  in CUDACore.llvm_sm_support(v"18")
-    @test !(sm"90a"  in CUDACore.llvm_sm_support(v"17"))  # sm_90a added in LLVM 18
-    @test !(sm"103a" in CUDACore.llvm_sm_support(v"20"))  # sm_103a added in LLVM 21
-    @test !(sm"100f" in CUDACore.llvm_sm_support(v"20"))  # sm_100f added in LLVM 21
-    @test  sm"70"   in CUDACore.llvm_sm_support(v"15")
-end
-
 end
 
 ############################################################################################

From cf5b4b6ea878909de617966bfd33c45d2885b51f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 22:02:46 +0200
Subject: [PATCH 16/16] Fix docs.

[only docs]
---
 CUDACore/src/compiler/sm.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl
index 420e280eea..86f566766a 100644
--- a/CUDACore/src/compiler/sm.jl
+++ b/CUDACore/src/compiler/sm.jl
@@ -11,7 +11,7 @@ subtarget feature set selected by the suffix on its `.target` directive. Printed
 parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"`
 for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]`
 notation in the PTX ISA reference and to distinguish visually from a device-level
-[`VersionNumber`](@ref) like `v"9.0"`.
+`VersionNumber` like `v"9.0"`.
 
 The single-argument constructors normalize various inputs to an `SMVersion`: