JuliaGPU · maleadt · May 20, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.10"
+GPUCompiler = "1.12"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"

diff --git a/CUDACore/lib/cudadrv/state.jl b/CUDACore/lib/cudadrv/state.jl
@@ -227,7 +227,7 @@ function context(dev::CuDevice)
                maxlog=1, _id=devidx)
     end
     # ... or too new
-    if !in(capability(dev), cuda_compat().cap)
+    if !in(capability(dev), ptxas_compat().cap)
         @warn("""Your $(name(dev)) GPU (compute capability $(capability(dev).major).$(capability(dev).minor)) is not fully supported by CUDA $(runtime_version()).
                  Some functionality may be broken. Ensure you are using the latest version of CUDA.jl in combination with an up-to-date NVIDIA driver.
                  If that does not help, please file an issue to add support for the latest CUDA toolkit.""",

diff --git a/CUDACore/src/CUDACore.jl b/CUDACore/src/CUDACore.jl
@@ -80,6 +80,7 @@ include("../lib/cudadrv/CUDAdrv.jl")
 
 # essential stuff
 include("initialization.jl")
+include("compiler/sm.jl")
 include("compatibility.jl")
 include("debug.jl")
 

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
@@ -1,15 +1,25 @@
 # compatibility of Julia, CUDA and LLVM
 
-# NOTE: Target architectures with suffix “a”, such as sm_90a, include
-# architecture-accelerated features that are supported on the specified architecture only,
-# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
-# such targets cannot be run on later generation devices. Architecture-accelerated features
-# can only be used with targets that support these features.
-
 const lowest = v"0"
 const highest = v"999"
 
 
+# PTX compilation targets come in three feature-set flavors (carried on `SMVersion`),
+# selected via the suffix on the `.target` directive (and the matching `--gpu-name`
+# to ptxas):
+#
+#   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
+#     for sm_X runs on any sm_Y with Y >= X (onion model).
+#   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
+#     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
+#   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices.
+#
+# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX
+# require for them, are encoded directly in the entries of `ptx_sm_db` and `llvm_sm_db`
+# below (key: SM variant, value: version range); an unsupported combination has no entry.
+
+
 ## version range
 
 struct VersionRange
@@ -26,12 +36,12 @@ Base.intersect(v::VersionNumber, r::VersionRange) =
     v > r.upper ? (v:r.upper) : (v:v)
 
 
-## devices supported by the CUDA toolkit
+## devices supported by ptxas
 
 # Source:
 # - https://en.wikipedia.org/wiki/CUDA#GPUs_supported
 # - ptxas |& grep -A 10 '\--gpu-name'
-const cuda_cap_db = Dict(
+const ptxas_cap_db = Dict(
     v"1.0"   => between(lowest, v"6.5"),
     v"1.1"   => between(lowest, v"6.5"),
     v"1.2"   => between(lowest, v"6.5"),
@@ -63,9 +73,9 @@ const cuda_cap_db = Dict(
     v"12.1"  => between(v"12.9", highest),
 )
 
-function cuda_cap_support(ver::VersionNumber)
+function ptxas_cap_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_cap_db
+    for (cap,r) in ptxas_cap_db
         if ver in r
             push!(caps, cap)
         end
@@ -74,10 +84,10 @@ function cuda_cap_support(ver::VersionNumber)
 end
 
 
-## PTX ISAs supported by the CUDA toolkit
+## PTX ISAs supported by ptxas
 
 # Source: PTX ISA document, Release History table
-const cuda_ptx_db = Dict(
+const ptxas_ptx_db = Dict(
     v"1.0" => between(v"1.0", highest),
     v"1.1" => between(v"1.1", highest),
     v"1.2" => between(v"2.0", highest),
@@ -125,9 +135,9 @@ const cuda_ptx_db = Dict(
     v"9.2" => between(v"13.2", highest),
 )
 
-function cuda_ptx_support(ver::VersionNumber)
+function ptxas_ptx_support(ver::VersionNumber)
     caps = Set{VersionNumber}()
-    for (cap,r) in cuda_ptx_db
+    for (cap,r) in ptxas_ptx_db
         if ver in r
             push!(caps, cap)
         end
@@ -138,52 +148,54 @@ end
 
 ## devices supported by each PTX ISA
 
-# Source: PTX ISA document, Release History table
-const ptx_cap_db = Dict(
-    v"1.0"   => between(v"1.0", highest),
-    v"1.1"   => between(v"1.0", highest),
-    v"1.2"   => between(v"1.2", highest),
-    v"1.3"   => between(v"1.2", highest),
-    v"2.0"   => between(v"2.0", highest),
-    v"3.0"   => between(v"3.1", highest),
-    v"3.2"   => between(v"4.0", highest),
-    v"3.5"   => between(v"3.1", highest),
-    v"3.7"   => between(v"4.1", highest),
-    v"5.0"   => between(v"4.0", highest),
-    v"5.2"   => between(v"4.1", highest),
-    v"5.3"   => between(v"4.2", highest),
-    v"6.0"   => between(v"5.0", highest),
-    v"6.1"   => between(v"5.0", highest),
-    v"6.2"   => between(v"5.0", highest),
-    v"7.0"   => between(v"6.0", highest),
-    v"7.2"   => between(v"6.1", highest),
-    v"7.5"   => between(v"6.3", highest),
-    v"8.0"   => between(v"7.0", highest),
-    v"8.6"   => between(v"7.1", highest),
-    v"8.7"   => between(v"7.4", highest),
-    v"8.9"   => between(v"7.8", highest),
-    v"9.0"   => between(v"7.8", highest),
-    #v"9.0a" => between(v"8.0", highest)
-    v"10.0"  => between(v"8.6", highest),
-    #v"10.0a"=> between(v"8.6", highest),
-    #v"10.0f"=> between(v"8.8", highest),
-    v"10.1"  => between(v"8.6", highest),
-    #v"10.1a"=> between(v"8.6", highest),
-    #v"10.1f"=> between(v"8.8", highest),
-    v"10.3"  => between(v"8.8", highest),
-    #v"10.3a"=> between(v"8.8", highest),
-    #v"10.3f"=> between(v"8.8", highest),
-    v"12.0"  => between(v"8.7", highest),
-    #v"12.0a"=> between(v"8.7", highest),
-    #v"12.0f"=> between(v"8.8", highest),
-    v"12.1"  => between(v"8.8", highest),
-    #v"12.1a"=> between(v"8.8", highest),
-    #v"12.1f"=> between(v"8.8", highest),
+# Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants
+# were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8.
+const ptx_sm_db = Dict{SMVersion, VersionRange}(
+    sm"10"   => between(v"1.0", highest),
+    sm"11"   => between(v"1.0", highest),
+    sm"12"   => between(v"1.2", highest),
+    sm"13"   => between(v"1.2", highest),
+    sm"20"   => between(v"2.0", highest),
+    sm"30"   => between(v"3.1", highest),
+    sm"32"   => between(v"4.0", highest),
+    sm"35"   => between(v"3.1", highest),
+    sm"37"   => between(v"4.1", highest),
+    sm"50"   => between(v"4.0", highest),
+    sm"52"   => between(v"4.1", highest),
+    sm"53"   => between(v"4.2", highest),
+    sm"60"   => between(v"5.0", highest),
+    sm"61"   => between(v"5.0", highest),
+    sm"62"   => between(v"5.0", highest),
+    sm"70"   => between(v"6.0", highest),
+    sm"72"   => between(v"6.1", highest),
+    sm"75"   => between(v"6.3", highest),
+    sm"80"   => between(v"7.0", highest),
+    sm"86"   => between(v"7.1", highest),
+    sm"87"   => between(v"7.4", highest),
+    sm"89"   => between(v"7.8", highest),
+    sm"90"   => between(v"7.8", highest),
+    sm"90a"  => between(v"8.0", highest),
+    sm"100"  => between(v"8.6", highest),
+    sm"100a" => between(v"8.6", highest),
+    sm"100f" => between(v"8.8", highest),
+    sm"101"  => between(v"8.6", highest),
+    sm"101a" => between(v"8.6", highest),
+    sm"101f" => between(v"8.8", highest),
+    sm"103"  => between(v"8.8", highest),
+    sm"103a" => between(v"8.8", highest),
+    sm"103f" => between(v"8.8", highest),
+    sm"120"  => between(v"8.7", highest),
+    sm"120a" => between(v"8.7", highest),
+    sm"120f" => between(v"8.8", highest),
+    sm"121"  => between(v"8.8", highest),
+    sm"121a" => between(v"8.8", highest),
+    sm"121f" => between(v"8.8", highest),
 )
 
-function ptx_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in ptx_cap_db
+# Set of `SMVersion`s (across all feature sets) whose PTX ISA floor is met by `ver`.
+function ptx_sm_support(ver::VersionNumber)
+    caps = Set{SMVersion}()
+    for (cap, r) in ptx_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -194,44 +206,52 @@ end
 
 ## devices supported by the LLVM NVPTX back-end
 
-# Source: LLVM/lib/Target/NVPTX/NVPTX.td
-const llvm_cap_db = Dict(
-    v"2.0"   => between(v"3.2", highest),
-    v"2.1"   => between(v"3.2", highest),
-    v"3.0"   => between(v"3.2", highest),
-    v"3.2"   => between(v"3.7", highest),
-    v"3.5"   => between(v"3.2", highest),
-    v"3.7"   => between(v"3.7", highest),
-    v"5.0"   => between(v"3.5", highest),
-    v"5.2"   => between(v"3.7", highest),
-    v"5.3"   => between(v"3.7", highest),
-    v"6.0"   => between(v"3.9", highest),
-    v"6.1"   => between(v"3.9", highest),
-    v"6.2"   => between(v"3.9", highest),
-    v"7.0"   => between(v"6", highest),
-    v"7.2"   => between(v"7", highest),
-    v"7.5"   => between(v"8", highest),
-    v"8.0"   => between(v"11", highest),
-    v"8.6"   => between(v"13", highest),
-    v"8.7"   => between(v"16", highest),
-    v"8.9"   => between(v"16", highest),
-    v"9.0"   => between(v"16", highest),
-    #v"9.0a" => between(v"18", highest),
-    v"10.0"  => between(v"20", highest),
-    #v"10.0a"=> between(v"20", highest),
-    v"10.1"  => between(v"20", highest),
-    #v"10.1a"=> between(v"20", highest),
-    v"10.3"  => between(v"21", highest),
-    #v"10.3a"=> between(v"21", highest),
-    v"12.0"  => between(v"20", highest),
-    #v"12.0a"=> between(v"20", highest),
-    v"12.1"  => between(v"21", highest),
-    #v"12.1a"=> between(v"21", highest),
+# Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up
+# here as a separate entry; without an entry LLVM does not know the variant CPU name and
+# constructing a TargetMachine with it would fall back to a generic subtarget.
+const llvm_sm_db = Dict{SMVersion, VersionRange}(
+    sm"20"   => between(v"3.2", highest),
+    sm"21"   => between(v"3.2", highest),
+    sm"30"   => between(v"3.2", highest),
+    sm"32"   => between(v"3.7", highest),
+    sm"35"   => between(v"3.2", highest),
+    sm"37"   => between(v"3.7", highest),
+    sm"50"   => between(v"3.5", highest),
+    sm"52"   => between(v"3.7", highest),
+    sm"53"   => between(v"3.7", highest),
+    sm"60"   => between(v"3.9", highest),
+    sm"61"   => between(v"3.9", highest),
+    sm"62"   => between(v"3.9", highest),
+    sm"70"   => between(v"6", highest),
+    sm"72"   => between(v"7", highest),
+    sm"75"   => between(v"8", highest),
+    sm"80"   => between(v"11", highest),
+    sm"86"   => between(v"13", highest),
+    sm"87"   => between(v"16", highest),
+    sm"89"   => between(v"16", highest),
+    sm"90"   => between(v"16", highest),
+    sm"90a"  => between(v"18", highest),
+    sm"100"  => between(v"20", highest),
+    sm"100a" => between(v"20", highest),
+    sm"100f" => between(v"21", highest),
+    sm"101"  => between(v"20", highest),
+    sm"101a" => between(v"20", highest),
+    sm"101f" => between(v"21", highest),
+    sm"103"  => between(v"21", highest),
+    sm"103a" => between(v"21", highest),
+    sm"103f" => between(v"21", highest),
+    sm"120"  => between(v"20", highest),
+    sm"120a" => between(v"20", highest),
+    sm"120f" => between(v"21", highest),
+    sm"121"  => between(v"21", highest),
+    sm"121a" => between(v"21", highest),
+    sm"121f" => between(v"21", highest),
 )
 
-function llvm_cap_support(ver::VersionNumber)
-    caps = Set{VersionNumber}()
-    for (cap,r) in llvm_cap_db
+# Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`.
+function llvm_sm_support(ver::VersionNumber)
+    caps = Set{SMVersion}()
+    for (cap, r) in llvm_sm_db
         if ver in r
             push!(caps, cap)
         end
@@ -295,32 +315,14 @@ end
 function llvm_compat(version=LLVM.version())
     LLVM.InitializeNVPTXTarget()
 
-    cap_support = sort(collect(llvm_cap_support(version)))
-    ptx_support = sort(collect(llvm_ptx_support(version)))
-
-    return (cap=cap_support, ptx=ptx_support)
-end
-
-function cuda_compat(runtime=runtime_version(), compiler=compiler_version())
-    # we don't have to check the driver version, because it offers backwards compatbility
-    # beyond the CUDA toolkit version (e.g. R580 for CUDA 13 still supports Volta as
-    # deprecated in CUDA 13), and we don't have a reliable way to query the actual version
-    # as NVML isn't available on all platforms. let's instead simply assume that unsupported
-    # devices will not be exposed to the CUDA runtime and thus won't be visible to us.
-
-    # the compiler and runtime are versioned independently (and either can come from a
-    # local install), so we need to consider both:
-    # - device caps are dropped when either ptxas can't emit for them or the runtime
-    #   libraries drop them. take the intersection of both supported sets.
-    # - PTX ISA availability is a property of ptxas; the runtime doesn't care which ISA
-    #   compiled cubin came from.
-    cap_support = sort(collect(intersect(cuda_cap_support(runtime),
-                                         cuda_cap_support(compiler))))
-    ptx_support = sort(collect(cuda_ptx_support(compiler)))
-
-    return (cap=cap_support, ptx=ptx_support)
+    # `.sm` is a `Set{SMVersion}` (incl. `a`/`f` variants); `.ptx` is `Set{VersionNumber}`.
+    # `ptxas_compat()` instead returns `.cap` as `Set{VersionNumber}`, because ptxas-level
+    # support is tracked per CC -- each field name reflects its element type.
+    return (sm=llvm_sm_support(version),
+            ptx=llvm_ptx_support(version))
 end
 
-function ptx_compat(ptx)
-    return (cap=ptx_cap_support(ptx),)
+function ptxas_compat(version=compiler_version())
+    return (cap=ptxas_cap_support(version),
+            ptx=ptxas_ptx_support(version))
 end