From 8d65f63b0301db1fd398e593261fad401bf62123 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 09:54:32 +0000 Subject: [PATCH 01/16] Add architecture- and family-specific PTX target support --- CUDACore/src/compatibility.jl | 56 +++++++++++++++++----------- CUDACore/src/compiler/compilation.jl | 38 +++++++++++++++---- test/core/codegen.jl | 32 ++++++++++++++++ 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index afb833b32a..c4cbf177a3 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -1,14 +1,43 @@ # compatibility of Julia, CUDA and LLVM -# NOTE: Target architectures with suffix “a”, such as sm_90a, include -# architecture-accelerated features that are supported on the specified architecture only, -# hence such targets do not follow the onion layer model. Therefore, PTX code generated for -# such targets cannot be run on later generation devices. Architecture-accelerated features -# can only be used with targets that support these features. +# PTX compilation targets come in three feature-set flavors, selected via the suffix on the +# `.target` directive (and the matching `--gpu-name` to ptxas): +# +# - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled +# for sm_X runs on any sm_Y with Y >= X (onion model). +# - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; +# code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. +# Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap. +# - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one +# exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with +# CC 9.0; uses the same PTX requirement as the plain target. +# +# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma, +# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across +# devices in the same major family; baseline unlocks only the forward-portable set. const lowest = v"0" const highest = v"999" +# PTX compilation target feature set; see top-of-file note for the hierarchy and rules. +@enum PTXTargetKind Baseline Family Architectural + +# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the +# kind PTX floors are uniform across caps, so we encode them here rather than in the +# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax +# was introduced in PTX 8.0; the `f` syntax in PTX 8.8. +function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind) + if kind === Architectural + cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap") + ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx") + end + if kind === Family + cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap") + ptx >= v"8.8" || error("Family targets require PTX ISA >= 8.8; got $ptx") + end + return +end + ## version range @@ -163,22 +192,11 @@ const ptx_cap_db = Dict( v"8.7" => between(v"7.4", highest), v"8.9" => between(v"7.8", highest), v"9.0" => between(v"7.8", highest), - #v"9.0a" => between(v"8.0", highest) v"10.0" => between(v"8.6", highest), - #v"10.0a"=> between(v"8.6", highest), - #v"10.0f"=> between(v"8.8", highest), v"10.1" => between(v"8.6", highest), - #v"10.1a"=> between(v"8.6", highest), - #v"10.1f"=> between(v"8.8", highest), v"10.3" => between(v"8.8", highest), - #v"10.3a"=> between(v"8.8", highest), - #v"10.3f"=> between(v"8.8", highest), v"12.0" => between(v"8.7", highest), - #v"12.0a"=> between(v"8.7", highest), - #v"12.0f"=> between(v"8.8", highest), v"12.1" => between(v"8.8", highest), - #v"12.1a"=> between(v"8.8", highest), - #v"12.1f"=> between(v"8.8", highest), ) function ptx_cap_support(ver::VersionNumber) @@ -216,17 +234,11 @@ const llvm_cap_db = Dict( v"8.7" => between(v"16", highest), v"8.9" => between(v"16", highest), v"9.0" => between(v"16", highest), - #v"9.0a" => between(v"18", highest), v"10.0" => between(v"20", highest), - #v"10.0a"=> between(v"20", highest), v"10.1" => between(v"20", highest), - #v"10.1a"=> between(v"20", highest), v"10.3" => between(v"21", highest), - #v"10.3a"=> between(v"21", highest), v"12.0" => between(v"20", highest), - #v"12.0a"=> between(v"20", highest), v"12.1" => between(v"21", highest), - #v"12.1a"=> between(v"21", highest), ) function llvm_cap_support(ver::VersionNumber) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 1b60e42602..b3a5e2fc30 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -3,15 +3,27 @@ Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams cap::VersionNumber ptx::VersionNumber + kind::PTXTargetKind = Baseline end function Base.hash(params::CUDACompilerParams, h::UInt) h = hash(params.cap, h) h = hash(params.ptx, h) + h = hash(params.kind, h) return h end +# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target` +# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural` +# (ptxas requires exact match) and need to be in the same major family for `kind=Family`; +# emitting the same string on both sides handles all three kinds correctly. +function format_target(cap::VersionNumber, kind::PTXTargetKind) + suffix = kind === Architectural ? "a" : + kind === Family ? "f" : "" + return "sm_$(cap.major)$(cap.minor)$suffix" +end + const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams} const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams} @@ -119,10 +131,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap) +function rewrite_ptx_header(asm, ptx, cap, kind=Baseline) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)") + r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -145,9 +157,9 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap) = job.config.params - if job.config.target.ptx != ptx || job.config.target.cap != cap - asm = rewrite_ptx_header(asm, ptx, cap) + (; ptx, cap, kind) = job.config.params + if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline + asm = rewrite_ptx_header(asm, ptx, cap, kind) end return asm @@ -179,7 +191,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) + cap=nothing, ptx=nothing, kind=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -236,9 +248,18 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = compiler_version() >= v"11.7" + # default the target feature set based on the device cap. Architectural is the + # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict + # superset of Baseline, and the cubin is per-device anyway so portability isn't on + # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. + if kind === nothing + kind = cuda_cap >= v"9.0" ? Architectural : Baseline + end + validate_target_kind(cuda_cap, cuda_ptx, kind) + # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx) + params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind) CompilerConfig(target, params; kernel, name, always_inline) end @@ -275,7 +296,8 @@ function compile(@nospecialize(job::CompilerJob)) ptx = job.config.params.ptx cap = job.config.params.cap - arch = "sm_$(cap.major)$(cap.minor)" + kind = job.config.params.kind + arch = format_target(cap, kind) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 02f275aa7d..f813197b46 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -259,6 +259,38 @@ end @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) + + # Architectural kind appends an `a` suffix to the .target directive (and the same + # string is what `compile()` passes to --gpu-name, since ptxas requires exact match + # for `a`-mode). + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural) + @test occursin(".target sm_90a", asm_arch) + @test success(run_ptxas(asm_arch, "sm_90a")) + + # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line. + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family) + @test occursin(".target sm_100f", asm_family) + @test success(run_ptxas(asm_family, "sm_100f")) +end + +@testset "CUDACompilerParams hash discriminates on kind" begin + # Without `kind` in the hash, two params differing only on kind would collide in + # the compiler cache and silently return a cubin compiled for the wrong feature set. + base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural) + @test hash(base) != hash(arch) + @test base != arch +end + +@testset "validate_target_kind" begin + # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8. + @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural) + @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural) + @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family) + @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family) + @test CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Architectural) === nothing + @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing + @test CUDACore.validate_target_kind(v"5.0", v"6.2", CUDACore.Baseline) === nothing end end From 9f254a296d29b505fdd6fe71cd84247ef44e2524 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 10:24:39 +0000 Subject: [PATCH 02/16] Remove `kind` compiler config kwarg --- CUDACore/src/compiler/compilation.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index b3a5e2fc30..cd6e9d4b02 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -191,7 +191,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kind=nothing, kwargs...) + cap=nothing, ptx=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -248,14 +248,11 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = compiler_version() >= v"11.7" - # default the target feature set based on the device cap. Architectural is the + # pick the target feature set based on the device cap. Architectural is the # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict # superset of Baseline, and the cubin is per-device anyway so portability isn't on # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. - if kind === nothing - kind = cuda_cap >= v"9.0" ? Architectural : Baseline - end - validate_target_kind(cuda_cap, cuda_ptx, kind) + kind = cuda_cap >= v"9.0" ? Architectural : Baseline # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) From 440708d515ede17ae77d7a2cf7f5d51f750b6259 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 19:28:09 +0000 Subject: [PATCH 03/16] Add `feature_set` compiler config kwarg; use symbols instead of enum type --- CUDACore/src/compatibility.jl | 38 +++++++----------- CUDACore/src/compiler/compilation.jl | 58 ++++++++++++++++------------ CUDACore/src/compiler/execution.jl | 4 +- test/core/codegen.jl | 42 ++++++++++---------- 4 files changed, 72 insertions(+), 70 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index c4cbf177a3..4e58578691 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -1,5 +1,9 @@ # compatibility of Julia, CUDA and LLVM +const lowest = v"0" +const highest = v"999" + + # PTX compilation targets come in three feature-set flavors, selected via the suffix on the # `.target` directive (and the matching `--gpu-name` to ptxas): # @@ -8,32 +12,20 @@ # - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; # code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. # Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap. -# - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one -# exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with +# - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one +# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with # CC 9.0; uses the same PTX requirement as the plain target. # -# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma, -# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across -# devices in the same major family; baseline unlocks only the forward-portable set. - -const lowest = v"0" -const highest = v"999" - -# PTX compilation target feature set; see top-of-file note for the hierarchy and rules. -@enum PTXTargetKind Baseline Family Architectural - -# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the -# kind PTX floors are uniform across caps, so we encode them here rather than in the -# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax -# was introduced in PTX 8.0; the `f` syntax in PTX 8.8. -function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind) - if kind === Architectural - cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap") - ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx") +function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol) + if !(feature_set in (:baseline, :family, :architecture)) + error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))") end - if kind === Family - cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap") - ptx >= v"8.8" || error("Family targets require PTX ISA >= 8.8; got $ptx") + if feature_set === :architecture + cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap") + ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx") + elseif feature_set === :family + cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap") + ptx >= v"8.8" || error("Family-specific targets require PTX ISA >= 8.8; got $ptx") end return end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index cd6e9d4b02..2e21e3b603 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -3,24 +3,24 @@ Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams cap::VersionNumber ptx::VersionNumber - kind::PTXTargetKind = Baseline + feature_set::Symbol = :baseline end function Base.hash(params::CUDACompilerParams, h::UInt) h = hash(params.cap, h) h = hash(params.ptx, h) - h = hash(params.kind, h) + h = hash(params.feature_set, h) return h end -# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target` -# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural` -# (ptxas requires exact match) and need to be in the same major family for `kind=Family`; -# emitting the same string on both sides handles all three kinds correctly. -function format_target(cap::VersionNumber, kind::PTXTargetKind) - suffix = kind === Architectural ? "a" : - kind === Family ? "f" : "" +# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target` +# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture` +# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`; +# emitting the same string on both sides handles all three feature sets correctly. +function format_target(cap::VersionNumber, feature_set::Symbol) + suffix = feature_set === :architecture ? "a" : + feature_set === :family ? "f" : "" return "sm_$(cap.major)$(cap.minor)$suffix" end @@ -131,10 +131,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap, kind=Baseline) +function rewrite_ptx_header(asm, ptx, cap, feature_set) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))") + r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -157,9 +157,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap, kind) = job.config.params - if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline - asm = rewrite_ptx_header(asm, ptx, cap, kind) + (; ptx, cap, feature_set) = job.config.params + needs_rewrite = job.config.target.ptx != ptx || + job.config.target.cap != cap || + feature_set !== :baseline + if needs_rewrite + asm = rewrite_ptx_header(asm, ptx, cap, feature_set) end return asm @@ -191,7 +194,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) + cap=nothing, ptx=nothing, feature_set=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -248,15 +251,22 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = compiler_version() >= v"11.7" - # pick the target feature set based on the device cap. Architectural is the - # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict - # superset of Baseline, and the cubin is per-device anyway so portability isn't on - # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. - kind = cuda_cap >= v"9.0" ? Architectural : Baseline + # Pick the target feature set based on the device cap. + # Architecture-specific is chosen for devices where it's + # available (CC >= 9.0) since it's a strict superset of + # the baseline and family feature sets. + if feature_set === nothing + feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0" + :architecture + else + :baseline + end + end + validate_feature_set(cuda_cap, cuda_ptx, feature_set) # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind) + params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set) CompilerConfig(target, params; kernel, name, always_inline) end @@ -291,10 +301,8 @@ function compile(@nospecialize(job::CompilerJob)) push!(ptxas_opts, "--compile-only") end - ptx = job.config.params.ptx - cap = job.config.params.cap - kind = job.config.params.kind - arch = format_target(cap, kind) + (; ptx, cap, feature_set) = job.config.params + arch = format_target(cap, feature_set) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 130d049e7c..7cfe96e1a9 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} = ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream] @@ -434,6 +434,8 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for +- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the + most specific supported by the device The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index f813197b46..48af0d92f5 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -255,42 +255,42 @@ end @test !success(run_ptxas(asm_pre, "sm_75")) - asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0") + asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline) @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) - # Architectural kind appends an `a` suffix to the .target directive (and the same - # string is what `compile()` passes to --gpu-name, since ptxas requires exact match - # for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural) + # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same + # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture) @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) - # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line. - asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family) + # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line. + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family) @test occursin(".target sm_100f", asm_family) @test success(run_ptxas(asm_family, "sm_100f")) end -@testset "CUDACompilerParams hash discriminates on kind" begin - # Without `kind` in the hash, two params differing only on kind would collide in - # the compiler cache and silently return a cubin compiled for the wrong feature set. - base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline) - arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural) +@testset "CUDACompilerParams hash discriminates on feature_set" begin + # Without feature_set in the hash, two params differing only on feature_set would collide + # in the compiler cache and silently return a cubin compiled for the wrong feature set. + base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture) @test hash(base) != hash(arch) @test base != arch end -@testset "validate_target_kind" begin - # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8. - @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural) - @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural) - @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family) - @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family) - @test CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Architectural) === nothing - @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing - @test CUDACore.validate_target_kind(v"5.0", v"6.2", CUDACore.Baseline) === nothing +@testset "validate_feature_set" begin + # Architecture-specific needs CC >= 9.0 and PTX >= 8.0 + # Family-specific needs CC >= 10.0 and PTX >= 8.8. + @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family) + @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family) + @test CUDACore.validate_feature_set(v"9.0", v"8.0", :architecture) === nothing + @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing + @test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing end end From 74e8ddd24d5ec7291b3d6d0556f6d4b338e4849b Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 19:59:39 +0000 Subject: [PATCH 04/16] Require opt-in through `feature_set` to avoid breaking changes --- CUDACore/src/compiler/compilation.jl | 14 +++----------- CUDACore/src/compiler/execution.jl | 3 +-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 2e21e3b603..6b859957eb 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -251,17 +251,9 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = compiler_version() >= v"11.7" - # Pick the target feature set based on the device cap. - # Architecture-specific is chosen for devices where it's - # available (CC >= 9.0) since it's a strict superset of - # the baseline and family feature sets. - if feature_set === nothing - feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0" - :architecture - else - :baseline - end - end + # Conservatively pick baseline for backward compatibility, + # requiring explicit opt-in for family- and architecture-specific instructions. + feature_set = something(feature_set, :baseline) validate_feature_set(cuda_cap, cuda_ptx, feature_set) # create GPUCompiler objects diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 7cfe96e1a9..e5359e50fc 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -434,8 +434,7 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for -- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the - most specific supported by the device +- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture` The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when From 1faeb7eb6f3b77cd310452b50b73b9ebbba27ebe Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Fri, 1 May 2026 15:41:37 +0000 Subject: [PATCH 05/16] Shorten `:architecture` to `:arch` --- CUDACore/src/compatibility.jl | 6 +++--- CUDACore/src/compiler/compilation.jl | 4 ++-- CUDACore/src/compiler/execution.jl | 2 +- test/core/codegen.jl | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 4e58578691..08506a37af 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -17,10 +17,10 @@ const highest = v"999" # CC 9.0; uses the same PTX requirement as the plain target. # function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol) - if !(feature_set in (:baseline, :family, :architecture)) - error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))") + if !(feature_set in (:baseline, :family, :arch)) + error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))") end - if feature_set === :architecture + if feature_set === :arch cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap") ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx") elseif feature_set === :family diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 6b859957eb..736e7ef23e 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -15,11 +15,11 @@ function Base.hash(params::CUDACompilerParams, h::UInt) end # Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target` -# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture` +# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch` # (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`; # emitting the same string on both sides handles all three feature sets correctly. function format_target(cap::VersionNumber, feature_set::Symbol) - suffix = feature_set === :architecture ? "a" : + suffix = feature_set === :arch ? "a" : feature_set === :family ? "f" : "" return "sm_$(cap.major)$(cap.minor)$suffix" end diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index e5359e50fc..619acaa327 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -434,7 +434,7 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for -- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture` +- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch` The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 48af0d92f5..2096439b6b 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -262,7 +262,7 @@ end # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture) + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch) @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) @@ -276,7 +276,7 @@ end # Without feature_set in the hash, two params differing only on feature_set would collide # in the compiler cache and silently return a cubin compiled for the wrong feature set. base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline) - arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch) @test hash(base) != hash(arch) @test base != arch end @@ -284,11 +284,11 @@ end @testset "validate_feature_set" begin # Architecture-specific needs CC >= 9.0 and PTX >= 8.0 # Family-specific needs CC >= 10.0 and PTX >= 8.8. - @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture) - @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch) @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family) @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family) - @test CUDACore.validate_feature_set(v"9.0", v"8.0", :architecture) === nothing + @test CUDACore.validate_feature_set(v"9.0", v"8.0", :arch) === nothing @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing @test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing end From fbfb253f5e45e9028b53dc039dfa3557f61dcd59 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 08:38:06 +0200 Subject: [PATCH 06/16] Unify version databases again using a sm string. --- CUDACore/src/CUDACore.jl | 1 + CUDACore/src/compatibility.jl | 205 ++++++++++++++++----------- CUDACore/src/compiler/compilation.jl | 96 ++++++++----- CUDACore/src/compiler/execution.jl | 9 +- CUDACore/src/compiler/sm.jl | 56 ++++++++ CUDACore/src/device/runtime.jl | 2 +- CUDACore/src/precompile.jl | 2 +- perf/volumerhs.jl | 22 +++ test/core/codegen.jl | 62 ++++++-- test/core/execution.jl | 31 +++- 10 files changed, 346 insertions(+), 140 deletions(-) create mode 100644 CUDACore/src/compiler/sm.jl diff --git a/CUDACore/src/CUDACore.jl b/CUDACore/src/CUDACore.jl index fa2bfa2b45..c0332759ff 100644 --- a/CUDACore/src/CUDACore.jl +++ b/CUDACore/src/CUDACore.jl @@ -80,6 +80,7 @@ include("../lib/cudadrv/CUDAdrv.jl") # essential stuff include("initialization.jl") +include("compiler/sm.jl") include("compatibility.jl") include("debug.jl") diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 08506a37af..770d717834 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -4,31 +4,20 @@ const lowest = v"0" const highest = v"999" -# PTX compilation targets come in three feature-set flavors, selected via the suffix on the -# `.target` directive (and the matching `--gpu-name` to ptxas): +# PTX compilation targets come in three feature-set flavors (carried on `SMVersion`), +# selected via the suffix on the `.target` directive (and the matching `--gpu-name` +# to ptxas): # # - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled # for sm_X runs on any sm_Y with Y >= X (onion model). # - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; # code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. -# Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap. # - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one -# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with -# CC 9.0; uses the same PTX requirement as the plain target. +# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. # -function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol) - if !(feature_set in (:baseline, :family, :arch)) - error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))") - end - if feature_set === :arch - cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap") - ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx") - elseif feature_set === :family - cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap") - ptx >= v"8.8" || error("Family-specific targets require PTX ISA >= 8.8; got $ptx") - end - return -end +# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX +# require for them, is encoded directly in the keys of `ptx_cap_db` and `llvm_cap_db` +# below: an unsupported combination simply has no entry. ## version range @@ -159,41 +148,54 @@ end ## devices supported by each PTX ISA -# Source: PTX ISA document, Release History table -const ptx_cap_db = Dict( - v"1.0" => between(v"1.0", highest), - v"1.1" => between(v"1.0", highest), - v"1.2" => between(v"1.2", highest), - v"1.3" => between(v"1.2", highest), - v"2.0" => between(v"2.0", highest), - v"3.0" => between(v"3.1", highest), - v"3.2" => between(v"4.0", highest), - v"3.5" => between(v"3.1", highest), - v"3.7" => between(v"4.1", highest), - v"5.0" => between(v"4.0", highest), - v"5.2" => between(v"4.1", highest), - v"5.3" => between(v"4.2", highest), - v"6.0" => between(v"5.0", highest), - v"6.1" => between(v"5.0", highest), - v"6.2" => between(v"5.0", highest), - v"7.0" => between(v"6.0", highest), - v"7.2" => between(v"6.1", highest), - v"7.5" => between(v"6.3", highest), - v"8.0" => between(v"7.0", highest), - v"8.6" => between(v"7.1", highest), - v"8.7" => between(v"7.4", highest), - v"8.9" => between(v"7.8", highest), - v"9.0" => between(v"7.8", highest), - v"10.0" => between(v"8.6", highest), - v"10.1" => between(v"8.6", highest), - v"10.3" => between(v"8.8", highest), - v"12.0" => between(v"8.7", highest), - v"12.1" => between(v"8.8", highest), +# Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants +# were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8. +const ptx_cap_db = Dict{SMVersion, VersionRange}( + sm"1.0" => between(v"1.0", highest), + sm"1.1" => between(v"1.0", highest), + sm"1.2" => between(v"1.2", highest), + sm"1.3" => between(v"1.2", highest), + sm"2.0" => between(v"2.0", highest), + sm"3.0" => between(v"3.1", highest), + sm"3.2" => between(v"4.0", highest), + sm"3.5" => between(v"3.1", highest), + sm"3.7" => between(v"4.1", highest), + sm"5.0" => between(v"4.0", highest), + sm"5.2" => between(v"4.1", highest), + sm"5.3" => between(v"4.2", highest), + sm"6.0" => between(v"5.0", highest), + sm"6.1" => between(v"5.0", highest), + sm"6.2" => between(v"5.0", highest), + sm"7.0" => between(v"6.0", highest), + sm"7.2" => between(v"6.1", highest), + sm"7.5" => between(v"6.3", highest), + sm"8.0" => between(v"7.0", highest), + sm"8.6" => between(v"7.1", highest), + sm"8.7" => between(v"7.4", highest), + sm"8.9" => between(v"7.8", highest), + sm"9.0" => between(v"7.8", highest), + sm"9.0a" => between(v"8.0", highest), + sm"10.0" => between(v"8.6", highest), + sm"10.0a" => between(v"8.6", highest), + sm"10.0f" => between(v"8.8", highest), + sm"10.1" => between(v"8.6", highest), + sm"10.1a" => between(v"8.6", highest), + sm"10.1f" => between(v"8.8", highest), + sm"10.3" => between(v"8.8", highest), + sm"10.3a" => between(v"8.8", highest), + sm"10.3f" => between(v"8.8", highest), + sm"12.0" => between(v"8.7", highest), + sm"12.0a" => between(v"8.7", highest), + sm"12.0f" => between(v"8.8", highest), + sm"12.1" => between(v"8.8", highest), + sm"12.1a" => between(v"8.8", highest), + sm"12.1f" => between(v"8.8", highest), ) +# Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`. function ptx_cap_support(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap,r) in ptx_cap_db + caps = Set{SMVersion}() + for (cap, r) in ptx_cap_db if ver in r push!(caps, cap) end @@ -201,41 +203,66 @@ function ptx_cap_support(ver::VersionNumber) return caps end +# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic. +function ptx_baseline_caps(ver::VersionNumber) + caps = Set{VersionNumber}() + for (cap, r) in ptx_cap_db + if cap.feature_set === :baseline && ver in r + push!(caps, base_version(cap)) + end + end + return caps +end + ## devices supported by the LLVM NVPTX back-end -# Source: LLVM/lib/Target/NVPTX/NVPTX.td -const llvm_cap_db = Dict( - v"2.0" => between(v"3.2", highest), - v"2.1" => between(v"3.2", highest), - v"3.0" => between(v"3.2", highest), - v"3.2" => between(v"3.7", highest), - v"3.5" => between(v"3.2", highest), - v"3.7" => between(v"3.7", highest), - v"5.0" => between(v"3.5", highest), - v"5.2" => between(v"3.7", highest), - v"5.3" => between(v"3.7", highest), - v"6.0" => between(v"3.9", highest), - v"6.1" => between(v"3.9", highest), - v"6.2" => between(v"3.9", highest), - v"7.0" => between(v"6", highest), - v"7.2" => between(v"7", highest), - v"7.5" => between(v"8", highest), - v"8.0" => between(v"11", highest), - v"8.6" => between(v"13", highest), - v"8.7" => between(v"16", highest), - v"8.9" => between(v"16", highest), - v"9.0" => between(v"16", highest), - v"10.0" => between(v"20", highest), - v"10.1" => between(v"20", highest), - v"10.3" => between(v"21", highest), - v"12.0" => between(v"20", highest), - v"12.1" => between(v"21", highest), +# Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up +# here as a separate entry; without an entry LLVM does not know the variant CPU name and +# constructing a TargetMachine with it would fall back to a generic subtarget. +const llvm_cap_db = Dict{SMVersion, VersionRange}( + sm"2.0" => between(v"3.2", highest), + sm"2.1" => between(v"3.2", highest), + sm"3.0" => between(v"3.2", highest), + sm"3.2" => between(v"3.7", highest), + sm"3.5" => between(v"3.2", highest), + sm"3.7" => between(v"3.7", highest), + sm"5.0" => between(v"3.5", highest), + sm"5.2" => between(v"3.7", highest), + sm"5.3" => between(v"3.7", highest), + sm"6.0" => between(v"3.9", highest), + sm"6.1" => between(v"3.9", highest), + sm"6.2" => between(v"3.9", highest), + sm"7.0" => between(v"6", highest), + sm"7.2" => between(v"7", highest), + sm"7.5" => between(v"8", highest), + sm"8.0" => between(v"11", highest), + sm"8.6" => between(v"13", highest), + sm"8.7" => between(v"16", highest), + sm"8.9" => between(v"16", highest), + sm"9.0" => between(v"16", highest), + sm"9.0a" => between(v"18", highest), + sm"10.0" => between(v"20", highest), + sm"10.0a" => between(v"20", highest), + sm"10.0f" => between(v"21", highest), + sm"10.1" => between(v"20", highest), + sm"10.1a" => between(v"20", highest), + sm"10.1f" => between(v"21", highest), + sm"10.3" => between(v"21", highest), + sm"10.3a" => between(v"21", highest), + sm"10.3f" => between(v"21", highest), + sm"12.0" => between(v"20", highest), + sm"12.0a" => between(v"20", highest), + sm"12.0f" => between(v"21", highest), + sm"12.1" => between(v"21", highest), + sm"12.1a" => between(v"21", highest), + sm"12.1f" => between(v"21", highest), ) +# Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`. function llvm_cap_support(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap,r) in llvm_cap_db + caps = Set{SMVersion}() + for (cap, r) in llvm_cap_db if ver in r push!(caps, cap) end @@ -243,6 +270,17 @@ function llvm_cap_support(ver::VersionNumber) return caps end +# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic. +function llvm_baseline_caps(ver::VersionNumber) + caps = Set{VersionNumber}() + for (cap, r) in llvm_cap_db + if cap.feature_set === :baseline && ver in r + push!(caps, base_version(cap)) + end + end + return caps +end + ## PTX ISAs supported by the LVM NVPTX back-end @@ -299,7 +337,10 @@ end function llvm_compat(version=LLVM.version()) LLVM.InitializeNVPTXTarget() - cap_support = sort(collect(llvm_cap_support(version))) + # the `.cap` field is used for the base-cap clamp in `_compiler_config`, so only + # baseline entries are surfaced here. Variant support is queried point-wise via + # `sm in llvm_cap_support(...)`. + cap_support = sort(collect(llvm_baseline_caps(version))) ptx_support = sort(collect(llvm_ptx_support(version))) return (cap=cap_support, ptx=ptx_support) @@ -326,5 +367,7 @@ function cuda_compat(runtime=runtime_version(), compiler=compiler_version()) end function ptx_compat(ptx) - return (cap=ptx_cap_support(ptx),) + # Baseline view for the clamp; variant support is queried point-wise via + # `sm in ptx_cap_support(...)`. + return (cap=ptx_baseline_caps(ptx),) end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 736e7ef23e..9486603dc5 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -1,29 +1,17 @@ ## gpucompiler interface implementation Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams - cap::VersionNumber + sm::SMVersion ptx::VersionNumber - feature_set::Symbol = :baseline end function Base.hash(params::CUDACompilerParams, h::UInt) - h = hash(params.cap, h) + h = hash(params.sm, h) h = hash(params.ptx, h) - h = hash(params.feature_set, h) return h end -# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target` -# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch` -# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`; -# emitting the same string on both sides handles all three feature sets correctly. -function format_target(cap::VersionNumber, feature_set::Symbol) - suffix = feature_set === :arch ? "a" : - feature_set === :family ? "f" : "" - return "sm_$(cap.major)$(cap.minor)$suffix" -end - const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams} const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams} @@ -131,10 +119,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap, feature_set) +function rewrite_ptx_header(asm, ptx::VersionNumber, sm::SMVersion) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))") + r"\.target sm_\d+\w*" => ".target $(cpu_name(sm))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -157,12 +145,16 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap, feature_set) = job.config.params + # The rewrite stamps `.target`/`.version` with the *requested* (cuda-side) values. + # When the GPUCompiler-side target matches, LLVM already emits the right header + # (including the `a`/`f` suffix, via the CPU name); we only rewrite when they differ, + # e.g. when we had to clamp the target down for LLVM compatibility. + (; ptx, sm) = job.config.params needs_rewrite = job.config.target.ptx != ptx || - job.config.target.cap != cap || - feature_set !== :baseline + job.config.target.cap != base_version(sm) || + job.config.target.feature_set !== sm.feature_set if needs_rewrite - asm = rewrite_ptx_header(asm, ptx, cap, feature_set) + asm = rewrite_ptx_header(asm, ptx, sm) end return asm @@ -194,7 +186,14 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, feature_set=nothing, kwargs...) + cap=nothing, ptx=nothing, kwargs...) + # convert / deprecate VersionNumber cap + if cap isa VersionNumber + Base.depwarn("Passing a VersionNumber to `cap=` is deprecated; use the `sm\"$(cap.major).$(cap.minor)\"` string macro instead.", + :cufunction) + cap = SMVersion(cap.major, cap.minor, :baseline) + end + # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -225,9 +224,12 @@ end # determine the compute capabilities to use. this should match the capability of the # current device, but if LLVM doesn't support it, we can target an older capability - # and pass a different `-arch` to `ptxas`. + # and pass a different `-arch` to `ptxas`. The feature_set rides on `cap::SMVersion`; + # the clamp logic below operates on the base version, with the feature_set carried + # through unchanged. ptx_support = ptx_compat(cuda_ptx) - requested_cap = @something(cap, min(capability(dev), maximum(ptx_support.cap))) + base_cap = cap === nothing ? nothing : base_version(cap) + requested_cap = @something(base_cap, min(capability(dev), maximum(ptx_support.cap))) llvm_caps = filter(<=(requested_cap), llvm_support.cap) if cap !== nothing ## use the highest capability supported by LLVM @@ -235,30 +237,53 @@ end error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())") llvm_cap = maximum(llvm_caps) ## use the capability as-is to invoke CUDA - cuda_cap = cap + cuda_sm = cap else ## use the highest capability supported by LLVM isempty(llvm_caps) && error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())") llvm_cap = maximum(llvm_caps) - ## use the highest capability supported by CUDA + ## use the highest capability supported by CUDA -- always baseline when defaulted + ## from the device (the device cap has no feature_set) cuda_caps = filter(<=(capability(dev)), cuda_support.cap) isempty(cuda_caps) && error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())") - cuda_cap = maximum(cuda_caps) + cuda_sm = SMVersion(maximum(cuda_caps).major, maximum(cuda_caps).minor, :baseline) end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = compiler_version() >= v"11.7" - # Conservatively pick baseline for backward compatibility, - # requiring explicit opt-in for family- and architecture-specific instructions. - feature_set = something(feature_set, :baseline) - validate_feature_set(cuda_cap, cuda_ptx, feature_set) + # An invalid (cap, feature_set, ptx) variant combination simply has no entry in + # `ptx_cap_db`: e.g. `sm"7.0a"` (no `*a` below CC 9.0) or `sm"10.0f"` at PTX 8.7 + # (`*f` needs 8.8). Only enforced for variants -- baseline lets inspection tools + # like `code_ptx` mix `cap` and `ptx` freely, even if the result won't actually load. + if cuda_sm.feature_set !== :baseline + cuda_sm in ptx_cap_support(cuda_ptx) || + error("$(cpu_name(cuda_sm)) is not supported by PTX ISA $(cuda_ptx)") + end + + # GPUCompiler's PTXCompilerTarget needs an `sm` the *installed* LLVM understands + # (the `*a`/`*f` CPU names are version-gated, see `llvm_cap_db`). If LLVM is too + # old to know the variant for the LLVM-side cap, fall back to baseline; the post-mcgen + # rewrite then patches `.target` to the requested value -- which is enough for + # inline-PTX users but won't unlock LLVM's arch-accelerated code paths. ptxas is more + # permissive than LLVM about claiming a more-specific `.target` than the IR actually + # uses, so the upgrade is safe; the reverse would not be. + llvm_sm = SMVersion(llvm_cap.major, llvm_cap.minor, cuda_sm.feature_set) + if !(llvm_sm in llvm_cap_support(LLVM.version())) + if cuda_sm.feature_set !== :baseline + @warn "LLVM $(LLVM.version()) does not support $(cpu_name(llvm_sm)) " * + "(needed to compile for $(cpu_name(cuda_sm))); LLVM will emit baseline " * + "code (inline PTX assembly is unaffected)." + end + llvm_sm = SMVersion(llvm_sm.major, llvm_sm.minor, :baseline) + end # create GPUCompiler objects - target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set) + target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, + feature_set=llvm_sm.feature_set, debuginfo, kwargs...) + params = CUDACompilerParams(; sm=cuda_sm, ptx=cuda_ptx) CompilerConfig(target, params; kernel, name, always_inline) end @@ -293,8 +318,9 @@ function compile(@nospecialize(job::CompilerJob)) push!(ptxas_opts, "--compile-only") end - (; ptx, cap, feature_set) = job.config.params - arch = format_target(cap, feature_set) + (; ptx, sm) = job.config.params + cap = base_version(sm) + arch = cpu_name(sm) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt @@ -307,7 +333,7 @@ function compile(@nospecialize(job::CompilerJob)) end if param_usage > param_limit msg = """Kernel invocation uses too much parameter memory. - $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by sm_$(cap.major)$(cap.minor) / PTX v$(ptx.major).$(ptx.minor).""" + $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by $(arch) / PTX v$(ptx.major).$(ptx.minor).""" try details = "\n\nRelevant parameters:" diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 619acaa327..a67a3f61ed 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} = ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream] @@ -433,8 +433,11 @@ The following keyword arguments are supported: - `name`: override the name that the kernel will have in the generated code - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals -- `cap` and `ptx`: to override the compute capability and PTX version to compile for -- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch` +- `cap` and `ptx`: to override the compute capability and PTX version to compile for. + `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g. + `cap=sm"10.3a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"10.0f"` + for family-portable Blackwell codegen. The bare form `cap=sm"10.3"` selects baseline + (forward-compatible) codegen. Passing a `VersionNumber` is deprecated. The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl new file mode 100644 index 0000000000..703d798159 --- /dev/null +++ b/CUDACore/src/compiler/sm.jl @@ -0,0 +1,56 @@ +# SMVersion: a PTX compilation target, identifying a CUDA compute capability together +# with its subtarget feature set. +# +# Constructed via the `sm"..."` string macro, which accepts the dotted form used in +# CUDA's documentation: `sm"10.3a"` is compute capability 10.3 with architecture- +# accelerated features. The bare form `sm"10.3"` is the default, forward-compatible +# feature set (the "onion model"). +# +# See `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature +# definitions, and CUDA's PTX ISA documentation under `.target` for the runtime +# compatibility implications: +# +# :baseline (no suffix) - forward-compatible (sm_X for any sm_Y >= X) +# :family ('f' suffix) - same-major-family-portable +# :arch ('a' suffix) - locked to one exact CC + +export SMVersion, @sm_str + +struct SMVersion + major::Int + minor::Int + feature_set::Symbol + + function SMVersion(major::Integer, minor::Integer, feature_set::Symbol = :baseline) + feature_set in (:baseline, :family, :arch) || + error("SMVersion feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))") + return new(Int(major), Int(minor), feature_set) + end +end + +# Suffix on the LLVM CPU name / `.target` directive +suffix(sm::SMVersion) = sm.feature_set === :arch ? "a" : + sm.feature_set === :family ? "f" : "" + +# LLVM CPU / PTX `.target` name (e.g. "sm_103a"). +cpu_name(sm::SMVersion) = "sm_$(sm.major)$(sm.minor)$(suffix(sm))" + +# Drop the feature set to recover the base compute-capability `VersionNumber`, +# usable against the version-keyed compatibility databases. +base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor) + +Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"") + +function _parse_sm(s::AbstractString) + m = match(r"^(\d+)\.(\d+)([af]?)$", s) + m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"") + major = parse(Int, m.captures[1]) + minor = parse(Int, m.captures[2]) + fs = m.captures[3] == "a" ? :arch : + m.captures[3] == "f" ? :family : :baseline + return SMVersion(major, minor, fs) +end + +macro sm_str(s) + return _parse_sm(s) +end diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl index cd3687f0ea..4457d37330 100644 --- a/CUDACore/src/device/runtime.jl +++ b/CUDACore/src/device/runtime.jl @@ -19,7 +19,7 @@ function precompile_runtime() # NOTE: this often runs when we don't have a functioning set-up, # so we don't use `compiler_config` which requires NVML target = PTXCompilerTarget(; cap, ptx, debuginfo) - params = CUDACompilerParams(; cap, ptx) + params = CUDACompilerParams(; sm=SMVersion(cap.major, cap.minor), ptx) config = CompilerConfig(target, params) job = CompilerJob(mi, config) GPUCompiler.load_runtime(job) diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl index 58ae94f536..e145f63467 100644 --- a/CUDACore/src/precompile.jl +++ b/CUDACore/src/precompile.jl @@ -17,7 +17,7 @@ if :NVPTX in LLVM.backends() llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx)) target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true) - params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx) + params = CUDACompilerParams(; sm=SMVersion(llvm_cap.major, llvm_cap.minor), ptx=llvm_ptx) config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false) tt = Tuple{CuDeviceArray{Float32,1,AS.Global}} diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 6049552626..e347cf8300 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -232,11 +232,33 @@ function main() - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + + # Run once to validate: the result must be finite and the L1 sum must + # match a baked-in reference computed from this same StableRNG(123) + # seed. cuTile/perf/volumerhs.jl uses the same reference so the two + # implementations can be cross-checked. + CUDA.@sync kernel(rhs, Q, vgeo, DFloat(grav), D, nelem; + threads=threads, blocks=nelem) + rhs_h = Array(rhs) + @assert all(isfinite, rhs_h) "kernel produced non-finite values" + rsum = sum(rhs_h) + ref = 1.4227473f10 + rel = abs(rsum - ref) / abs(ref) + @assert rel < 1f-3 "rhs checksum off by $rel (got $rsum, expected $ref)" + @info "validation passed" rhs_sum=rsum reference=ref rel_err=rel + fill!(rhs, 0) + results = @benchmark begin + # zero rhs each iteration so accumulation stays meaningful + fill!($rhs, 0) CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; threads=$threads, blocks=$nelem) end + bytes = nelem * 28 * Nq^3 * sizeof(DFloat) + bw = bytes / (minimum(results).time / 1e9) / 1e9 + @info "SIMT volumerhs! benchmark" min_ms=minimum(results).time/1e6 median_ms=median(results).time/1e6 effective_BW="$(round(Int, bw)) GB/s" + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them CUDA.unsafe_free!(rhs) CUDA.unsafe_free!(Q) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 2096439b6b..ffac61be81 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -255,42 +255,74 @@ end @test !success(run_ptxas(asm_pre, "sm_75")) - asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline) + asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0") @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch) + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0a") @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line. - asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family) + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"10.0f") @test occursin(".target sm_100f", asm_family) @test success(run_ptxas(asm_family, "sm_100f")) end +@testset "SMVersion and sm\"...\" macro" begin + @test sm"9.0" == SMVersion(9, 0, :baseline) + @test sm"9.0a" == SMVersion(9, 0, :arch) + @test sm"10.0f" == SMVersion(10, 0, :family) + # printing roundtrips via the macro form + @test sprint(show, sm"10.3a") == "sm\"10.3a\"" + @test sprint(show, sm"10.0") == "sm\"10.0\"" + # cpu_name reflects feature_set + @test CUDACore.cpu_name(sm"9.0") == "sm_90" + @test CUDACore.cpu_name(sm"9.0a") == "sm_90a" + @test CUDACore.cpu_name(sm"10.0f") == "sm_100f" + # base_version drops the suffix back to a comparable VersionNumber + @test CUDACore.base_version(sm"10.3a") == v"10.3" + # constructor rejects bogus feature_set + @test_throws ErrorException SMVersion(9, 0, :bogus) + # macro rejects malformed strings + @test_throws ErrorException CUDACore._parse_sm("103a") # missing dot + @test_throws ErrorException CUDACore._parse_sm("10.3x") # unknown suffix + @test_throws ErrorException CUDACore._parse_sm("10") # missing minor +end + @testset "CUDACompilerParams hash discriminates on feature_set" begin # Without feature_set in the hash, two params differing only on feature_set would collide # in the compiler cache and silently return a cubin compiled for the wrong feature set. - base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline) - arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch) + base = CUDACore.CUDACompilerParams(sm=sm"9.0", ptx=v"8.0") + arch = CUDACore.CUDACompilerParams(sm=sm"9.0a", ptx=v"8.0") @test hash(base) != hash(arch) @test base != arch end -@testset "validate_feature_set" begin - # Architecture-specific needs CC >= 9.0 and PTX >= 8.0 - # Family-specific needs CC >= 10.0 and PTX >= 8.8. - @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch) - @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch) - @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family) - @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family) - @test CUDACore.validate_feature_set(v"9.0", v"8.0", :arch) === nothing - @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing - @test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing +@testset "ptx_cap_support" begin + # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"9.0") and PTX >= 8.0. + # Family-specific needs CC >= 10.0 (no `*f` keys below sm"10.0") and PTX >= 8.8. + @test !(sm"8.6a" in CUDACore.ptx_cap_support(v"8.0")) # no `*a` below CC 9.0 + @test !(sm"9.0a" in CUDACore.ptx_cap_support(v"7.8")) # `*a` requires PTX >= 8.0 + @test !(sm"9.0f" in CUDACore.ptx_cap_support(v"8.0")) # no `*f` below CC 10.0 + @test !(sm"10.0f" in CUDACore.ptx_cap_support(v"8.7")) # `*f` requires PTX >= 8.8 + @test sm"9.0a" in CUDACore.ptx_cap_support(v"8.0") + @test sm"10.0f" in CUDACore.ptx_cap_support(v"8.8") + @test sm"5.0" in CUDACore.ptx_cap_support(v"6.2") +end + +@testset "llvm_cap_support" begin + # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td. + @test sm"10.3a" in CUDACore.llvm_cap_support(v"21") + @test sm"10.0f" in CUDACore.llvm_cap_support(v"21") + @test sm"9.0a" in CUDACore.llvm_cap_support(v"18") + @test !(sm"9.0a" in CUDACore.llvm_cap_support(v"17")) # sm_90a added in LLVM 18 + @test !(sm"10.3a" in CUDACore.llvm_cap_support(v"20")) # sm_103a added in LLVM 21 + @test !(sm"10.0f" in CUDACore.llvm_cap_support(v"20")) # sm_100f added in LLVM 21 + @test sm"7.0" in CUDACore.llvm_cap_support(v"15") end end diff --git a/test/core/execution.jl b/test/core/execution.jl index edf589913d..f54da84bef 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -50,17 +50,40 @@ end @cuda threads=2 dummy() # sm_10 isn't supported by LLVM - @test_throws "not supported by LLVM" @cuda launch=false cap=v"1.0" dummy() + @test_throws "not supported by LLVM" @cuda launch=false cap=sm"1.0" dummy() # sm_20 is, but not by any CUDA version we support - @test_throws "Failed to compile PTX code" @cuda launch=false cap=v"2.0" dummy() + @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"2.0" dummy() # there isn't any capability other than the device's that's guaruanteed to work - @cuda launch=false cap=capability(device()) dummy() + dev_cap = capability(device()) + dev_sm = SMVersion(dev_cap.major, dev_cap.minor) + @cuda launch=false cap=dev_sm dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0")) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0")) @test contains(asm, ".target sm_50") asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3")) @test contains(asm, ".version 6.3") + + # feature_set is selected by the suffix on the sm"..." string; the suffix should + # surface in the .target directive in the PTX output. + if dev_cap >= v"9.0" + sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a)) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + # arch-specific cubin should also actually launch on the matching device + @cuda cap=sm_a dummy() + end + if dev_cap >= v"10.0" + sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f)) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") + @cuda cap=sm_f dummy() + end + + # passing a VersionNumber to `cap` is deprecated; check the depwarn fires while + # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on + # newer CUDA toolkits no longer accepts sm_50.) + @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0")) end From 9450ef74d5ab51704c5f7d258a22485e04709c01 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 11:02:32 +0200 Subject: [PATCH 07/16] Rework device compatibility handling. --- CUDACore/lib/cudadrv/state.jl | 2 +- CUDACore/src/compatibility.jl | 74 +++---------- CUDACore/src/compiler/compilation.jl | 123 +++++++++------------- CUDACore/src/compiler/sm.jl | 17 +++ CUDACore/src/device/intrinsics/version.jl | 17 ++- CUDACore/src/device/runtime.jl | 10 +- CUDACore/src/precompile.jl | 11 +- test/core/execution.jl | 46 +++++++- 8 files changed, 153 insertions(+), 147 deletions(-) diff --git a/CUDACore/lib/cudadrv/state.jl b/CUDACore/lib/cudadrv/state.jl index 6c03564bb8..32159136fa 100644 --- a/CUDACore/lib/cudadrv/state.jl +++ b/CUDACore/lib/cudadrv/state.jl @@ -227,7 +227,7 @@ function context(dev::CuDevice) maxlog=1, _id=devidx) end # ... or too new - if !in(capability(dev), cuda_compat().cap) + if !in(capability(dev), ptxas_compat().cap) @warn("""Your $(name(dev)) GPU (compute capability $(capability(dev).major).$(capability(dev).minor)) is not fully supported by CUDA $(runtime_version()). Some functionality may be broken. Ensure you are using the latest version of CUDA.jl in combination with an up-to-date NVIDIA driver. If that does not help, please file an issue to add support for the latest CUDA toolkit.""", diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 770d717834..71a0dfc0f0 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -36,12 +36,12 @@ Base.intersect(v::VersionNumber, r::VersionRange) = v > r.upper ? (v:r.upper) : (v:v) -## devices supported by the CUDA toolkit +## devices supported by ptxas # Source: # - https://en.wikipedia.org/wiki/CUDA#GPUs_supported # - ptxas |& grep -A 10 '\--gpu-name' -const cuda_cap_db = Dict( +const ptxas_cap_db = Dict( v"1.0" => between(lowest, v"6.5"), v"1.1" => between(lowest, v"6.5"), v"1.2" => between(lowest, v"6.5"), @@ -73,9 +73,9 @@ const cuda_cap_db = Dict( v"12.1" => between(v"12.9", highest), ) -function cuda_cap_support(ver::VersionNumber) +function ptxas_cap_support(ver::VersionNumber) caps = Set{VersionNumber}() - for (cap,r) in cuda_cap_db + for (cap,r) in ptxas_cap_db if ver in r push!(caps, cap) end @@ -84,10 +84,10 @@ function cuda_cap_support(ver::VersionNumber) end -## PTX ISAs supported by the CUDA toolkit +## PTX ISAs supported by ptxas # Source: PTX ISA document, Release History table -const cuda_ptx_db = Dict( +const ptxas_ptx_db = Dict( v"1.0" => between(v"1.0", highest), v"1.1" => between(v"1.1", highest), v"1.2" => between(v"2.0", highest), @@ -135,9 +135,9 @@ const cuda_ptx_db = Dict( v"9.2" => between(v"13.2", highest), ) -function cuda_ptx_support(ver::VersionNumber) +function ptxas_ptx_support(ver::VersionNumber) caps = Set{VersionNumber}() - for (cap,r) in cuda_ptx_db + for (cap,r) in ptxas_ptx_db if ver in r push!(caps, cap) end @@ -203,17 +203,6 @@ function ptx_cap_support(ver::VersionNumber) return caps end -# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic. -function ptx_baseline_caps(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap, r) in ptx_cap_db - if cap.feature_set === :baseline && ver in r - push!(caps, base_version(cap)) - end - end - return caps -end - ## devices supported by the LLVM NVPTX back-end @@ -270,17 +259,6 @@ function llvm_cap_support(ver::VersionNumber) return caps end -# Baseline-only view, returned as `VersionNumber`s for use by the cap-clamp logic. -function llvm_baseline_caps(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap, r) in llvm_cap_db - if cap.feature_set === :baseline && ver in r - push!(caps, base_version(cap)) - end - end - return caps -end - ## PTX ISAs supported by the LVM NVPTX back-end @@ -337,37 +315,11 @@ end function llvm_compat(version=LLVM.version()) LLVM.InitializeNVPTXTarget() - # the `.cap` field is used for the base-cap clamp in `_compiler_config`, so only - # baseline entries are surfaced here. Variant support is queried point-wise via - # `sm in llvm_cap_support(...)`. - cap_support = sort(collect(llvm_baseline_caps(version))) - ptx_support = sort(collect(llvm_ptx_support(version))) - - return (cap=cap_support, ptx=ptx_support) -end - -function cuda_compat(runtime=runtime_version(), compiler=compiler_version()) - # we don't have to check the driver version, because it offers backwards compatbility - # beyond the CUDA toolkit version (e.g. R580 for CUDA 13 still supports Volta as - # deprecated in CUDA 13), and we don't have a reliable way to query the actual version - # as NVML isn't available on all platforms. let's instead simply assume that unsupported - # devices will not be exposed to the CUDA runtime and thus won't be visible to us. - - # the compiler and runtime are versioned independently (and either can come from a - # local install), so we need to consider both: - # - device caps are dropped when either ptxas can't emit for them or the runtime - # libraries drop them. take the intersection of both supported sets. - # - PTX ISA availability is a property of ptxas; the runtime doesn't care which ISA - # compiled cubin came from. - cap_support = sort(collect(intersect(cuda_cap_support(runtime), - cuda_cap_support(compiler)))) - ptx_support = sort(collect(cuda_ptx_support(compiler))) - - return (cap=cap_support, ptx=ptx_support) + return (cap=llvm_cap_support(version), + ptx=llvm_ptx_support(version)) end -function ptx_compat(ptx) - # Baseline view for the clamp; variant support is queried point-wise via - # `sm in ptx_cap_support(...)`. - return (cap=ptx_baseline_caps(ptx),) +function ptxas_compat(version=compiler_version()) + return (cap=ptxas_cap_support(version), + ptx=ptxas_ptx_support(version)) end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 9486603dc5..32ff9f3375 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -194,96 +194,73 @@ end cap = SMVersion(cap.major, cap.minor, :baseline) end - # determine the toolchain + # inspect the toolchain llvm_support = llvm_compat() - cuda_support = cuda_compat() + ptxas_support = ptxas_compat() - # determine the PTX ISA to use. we want at least 6.2, but will use newer if possible. - requested_ptx = something(ptx, v"6.2") - llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx) - cuda_ptxs = filter(>=(requested_ptx), cuda_support.ptx) + # determine the PTX ISA to use. if ptx !== nothing - # the user requested a specific PTX ISA - ## use the highest ISA supported by LLVM - isempty(llvm_ptxs) && + # explicit request: take it exactly, validating against the toolchain + ptx in llvm_support.ptx || error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())") - llvm_ptx = maximum(llvm_ptxs) - ## use the ISA as-is to invoke CUDA - cuda_ptx = ptx + ptx in ptxas_support.ptx || + error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())") + llvm_ptx = ptxas_ptx = ptx else - # try to do the best thing (i.e., use the newest PTX ISA) - # XXX: is it safe to just use the latest PTX ISA? isn't it possible for, e.g., - # instructions to get deprecated? + # default: pick the newest PTX ISA supported by the toolchain (>=v6.2) + requested_ptx = v"6.2" + llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx) + ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx) isempty(llvm_ptxs) && error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())") - llvm_ptx = maximum(llvm_ptxs) - isempty(cuda_ptxs) && - error("CUDA.jl requires PTX $requested_ptx, which is not supported by CUDA $(compiler_version())") - cuda_ptx = maximum(cuda_ptxs) + isempty(ptxas_ptxs) && + error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())") + ptxas_ptx = maximum(ptxas_ptxs) + llvm_ptx = min(maximum(llvm_ptxs), ptxas_ptx) end - # determine the compute capabilities to use. this should match the capability of the - # current device, but if LLVM doesn't support it, we can target an older capability - # and pass a different `-arch` to `ptxas`. The feature_set rides on `cap::SMVersion`; - # the clamp logic below operates on the base version, with the feature_set carried - # through unchanged. - ptx_support = ptx_compat(cuda_ptx) - base_cap = cap === nothing ? nothing : base_version(cap) - requested_cap = @something(base_cap, min(capability(dev), maximum(ptx_support.cap))) - llvm_caps = filter(<=(requested_cap), llvm_support.cap) + # when selecting compute capabilities, we prefer the most recent one, as + # well as prefer to use architecture-accelerated features when available. + fs_rank(fs::Symbol) = fs === :arch ? 2 : fs === :family ? 1 : 0 + sm_key(sm::SMVersion) = (base_version(sm), fs_rank(sm.feature_set)) + + # determine the compute capability to use. + ## ptxas + ptx_caps = ptx_cap_support(ptxas_ptx) if cap !== nothing - ## use the highest capability supported by LLVM - isempty(llvm_caps) && - error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())") - llvm_cap = maximum(llvm_caps) - ## use the capability as-is to invoke CUDA - cuda_sm = cap + # explicit request: take it as-is, validating against the PTX ISA + cap in ptx_caps || + error("$(cpu_name(cap)) is not supported by PTX ISA $(ptxas_ptx)") + ptxas_sm = cap else - ## use the highest capability supported by LLVM - isempty(llvm_caps) && - error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())") - llvm_cap = maximum(llvm_caps) - ## use the highest capability supported by CUDA -- always baseline when defaulted - ## from the device (the device cap has no feature_set) - cuda_caps = filter(<=(capability(dev)), cuda_support.cap) - isempty(cuda_caps) && - error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())") - cuda_sm = SMVersion(maximum(cuda_caps).major, maximum(cuda_caps).minor, :baseline) + # pick the most specific capability the selected PTX ISA supports whose cubin + # would actually load on the current device. For baseline that's the onion model; + # `:arch` requires an exact CC match, `:family` a same-family match. + ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_caps) + isempty(ptxas_candidates) && + error("Compute capability $(capability(dev)) is not supported by ptxas " * + "$(compiler_version()) at PTX ISA $(ptxas_ptx)") + ptxas_sm = argmax(sm_key, ptxas_candidates) end - - # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 - debuginfo = compiler_version() >= v"11.7" - - # An invalid (cap, feature_set, ptx) variant combination simply has no entry in - # `ptx_cap_db`: e.g. `sm"7.0a"` (no `*a` below CC 9.0) or `sm"10.0f"` at PTX 8.7 - # (`*f` needs 8.8). Only enforced for variants -- baseline lets inspection tools - # like `code_ptx` mix `cap` and `ptx` freely, even if the result won't actually load. - if cuda_sm.feature_set !== :baseline - cuda_sm in ptx_cap_support(cuda_ptx) || - error("$(cpu_name(cuda_sm)) is not supported by PTX ISA $(cuda_ptx)") - end - - # GPUCompiler's PTXCompilerTarget needs an `sm` the *installed* LLVM understands - # (the `*a`/`*f` CPU names are version-gated, see `llvm_cap_db`). If LLVM is too - # old to know the variant for the LLVM-side cap, fall back to baseline; the post-mcgen - # rewrite then patches `.target` to the requested value -- which is enough for - # inline-PTX users but won't unlock LLVM's arch-accelerated code paths. ptxas is more - # permissive than LLVM about claiming a more-specific `.target` than the IR actually - # uses, so the upgrade is safe; the reverse would not be. - llvm_sm = SMVersion(llvm_cap.major, llvm_cap.minor, cuda_sm.feature_set) - if !(llvm_sm in llvm_cap_support(LLVM.version())) - if cuda_sm.feature_set !== :baseline - @warn "LLVM $(LLVM.version()) does not support $(cpu_name(llvm_sm)) " * - "(needed to compile for $(cpu_name(cuda_sm))); LLVM will emit baseline " * - "code (inline PTX assembly is unaffected)." + ## LLVM + if ptxas_sm in llvm_support.cap + llvm_sm = ptxas_sm + else + # Exact `ptxas_sm` unavailable in LLVM. Fall back to baseline LLVM at a + # lower base, since arch/family features don't carry across versions. + baseline_candidates = filter(llvm_support.cap) do sm + sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm) end - llvm_sm = SMVersion(llvm_sm.major, llvm_sm.minor, :baseline) + isempty(baseline_candidates) && + error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())") + llvm_sm = argmax(sm_key, baseline_candidates) end # create GPUCompiler objects target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, - feature_set=llvm_sm.feature_set, debuginfo, kwargs...) - params = CUDACompilerParams(; sm=cuda_sm, ptx=cuda_ptx) + feature_set=llvm_sm.feature_set, + debuginfo=true, kwargs...) + params = CUDACompilerParams(; sm=ptxas_sm, ptx=ptxas_ptx) CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index 703d798159..a96849d72f 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -39,6 +39,23 @@ cpu_name(sm::SMVersion) = "sm_$(sm.major)$(sm.minor)$(suffix(sm))" # usable against the version-keyed compatibility databases. base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor) +# Would a cubin compiled for `sm` actually load and run on a device with capability +# `dev_cap`? Per NVIDIA's PTX ISA reference (.target directive): +# - baseline: forward-compatible (onion model) -- any sm_X runs on sm_Y for Y >= X. +# - family: same architecture family (currently == same major) and forward-portable +# within the family. +# - arch: locked to one exact CC; cubin only loads on devices with that exact cap. +function runs_on(sm::SMVersion, dev_cap::VersionNumber) + if sm.feature_set === :arch + return base_version(sm) == dev_cap + elseif sm.feature_set === :family + return sm.major == dev_cap.major && base_version(sm) <= dev_cap + else # :baseline + return base_version(sm) <= dev_cap + end +end + + Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"") function _parse_sm(s::AbstractString) diff --git a/CUDACore/src/device/intrinsics/version.jl b/CUDACore/src/device/intrinsics/version.jl index 6114833eeb..ea66448bda 100644 --- a/CUDACore/src/device/intrinsics/version.jl +++ b/CUDACore/src/device/intrinsics/version.jl @@ -1,8 +1,8 @@ # device intrinsics for querying the compute SimpleVersion and PTX ISA version -export compute_capability, ptx_isa_version +export compute_capability, ptx_isa_version, target_feature_set -for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"] +for var in ["sm_major", "sm_minor", "sm_features", "ptx_major", "ptx_minor"] @eval @device_function @inline $(Symbol(var))() = Base.llvmcall( $("""@$var = external global i32 @@ -17,3 +17,16 @@ end @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor()) @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor()) +# Feature set encoded in the `.target` directive: one of `:baseline`, `:family`, `:arch`. +# (NVIDIA's PTX ISA reference: ".target specifies the set of features in the target +# architecture for which the current PTX code was generated.") GPUCompiler stamps the +# encoding in via the `sm_features` LLVM global, using `GPUCompiler.TargetFeatureSet`; +# the integer load + chained compare folds away after LLVM inlines the constant, so +# user code like `if target_feature_set() === :arch ... end` resolves to a single +# branch in the PTX output. +@device_function @inline function target_feature_set() + f = sm_features() + return f == UInt32(GPUCompiler.ArchFeatures) ? :arch : + f == UInt32(GPUCompiler.FamilyFeatures) ? :family : :baseline +end + diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl index 4457d37330..93e0cd90c5 100644 --- a/CUDACore/src/device/runtime.jl +++ b/CUDACore/src/device/runtime.jl @@ -12,14 +12,16 @@ function precompile_runtime() f = ()->return mi = methodinstance(typeof(f), Tuple{}) - caps = llvm_compat().cap + # `.cap` is now keyed by `SMVersion` and includes variants; runtime caches are + # feature_set-agnostic, so we only warm the baseline entries. + sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().cap) ptx = maximum(llvm_compat().ptx) JuliaContext() do ctx - for cap in caps, debuginfo in [false, true] + for sm in sms, debuginfo in [false, true] # NOTE: this often runs when we don't have a functioning set-up, # so we don't use `compiler_config` which requires NVML - target = PTXCompilerTarget(; cap, ptx, debuginfo) - params = CUDACompilerParams(; sm=SMVersion(cap.major, cap.minor), ptx) + target = PTXCompilerTarget(; cap=base_version(sm), ptx, debuginfo) + params = CUDACompilerParams(; sm, ptx) config = CompilerConfig(target, params) job = CompilerJob(mi, config) GPUCompiler.load_runtime(job) diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl index e145f63467..a5c39b1cd8 100644 --- a/CUDACore/src/precompile.jl +++ b/CUDACore/src/precompile.jl @@ -13,11 +13,16 @@ if :NVPTX in LLVM.backends() end llvm_support = llvm_compat() - llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap)) + # `.cap` is keyed by `SMVersion` and includes variants; pick the highest + # baseline cap <= v"7.5" for a portable precompile artifact. + llvm_sm = argmax(base_version, + filter(sm -> sm.feature_set === :baseline && + base_version(sm) <= v"7.5", + llvm_support.cap)) llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx)) - target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true) - params = CUDACompilerParams(; sm=SMVersion(llvm_cap.major, llvm_cap.minor), ptx=llvm_ptx) + target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true) + params = CUDACompilerParams(; sm=llvm_sm, ptx=llvm_ptx) config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false) tt = Tuple{CuDeviceArray{Float32,1,AS.Global}} diff --git a/test/core/execution.jl b/test/core/execution.jl index f54da84bef..ddb871b936 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -61,20 +61,30 @@ end asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0")) @test contains(asm, ".target sm_50") + # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the + # `.version` line should match what was asked for, independently of what LLVM and + # ptxas would natively pick. asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3")) @test contains(asm, ".version 6.3") + # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it + # used to be); a clearly out-of-range value must error at config time. + @test_throws "not supported" @cuda launch=false ptx=v"99.0" dummy() + # feature_set is selected by the suffix on the sm"..." string; the suffix should - # surface in the .target directive in the PTX output. + # surface in the .target directive in the PTX output. The cuda-side `.target` is + # the variant regardless of LLVM support -- the mcgen rewrite stamps it in even + # when LLVM clamped to baseline for codegen. + sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch) + sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) + if dev_cap >= v"9.0" - sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch) asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a)) @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") # arch-specific cubin should also actually launch on the matching device @cuda cap=sm_a dummy() end if dev_cap >= v"10.0" - sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f)) @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") @cuda cap=sm_f dummy() @@ -84,6 +94,36 @@ end # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on # newer CUDA toolkits no longer accepts sm_50.) @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0")) + + # With no explicit `cap=`, we default to architecture-specific code paths on CC >=9.0 + # since we know the exact device. The cuda-side `.target` is the variant regardless of + # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs. + if dev_cap >= v"9.0" + asm = sprint(io->CUDA.code_ptx(io, dummy, ())) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + end + + # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built + # for (not the cuda-side .target): when LLVM doesn't natively support the exact variant, + # we fall back to baseline LLVM, so the global reflects baseline. The if-chain folds at + # codegen time, so the launched kernel writes a single constant. + function read_feature_set!(out) + @inbounds out[1] = if target_feature_set() === :arch + UInt32(2) + elseif target_feature_set() === :family + UInt32(1) + else + UInt32(0) + end + return + end + out = CuArray{UInt32}([typemax(UInt32)]) + @cuda threads=1 read_feature_set!(out) + # arch features come through `target_feature_set()` only when LLVM natively supported + # the variant; otherwise we fell back to baseline LLVM and the global reflects that. + arch_in_llvm = sm_a in CUDACore.llvm_cap_support(CUDACore.LLVM.version()) + expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0) + @test Array(out)[1] == expected end From 5e385829a24832dc81e69af09ba7302c046956ec Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 11:41:27 +0200 Subject: [PATCH 08/16] Report on detected compiler support. --- CUDATools/src/utilities.jl | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/CUDATools/src/utilities.jl b/CUDATools/src/utilities.jl index 079ad71eb3..c14453cb7a 100644 --- a/CUDATools/src/utilities.jl +++ b/CUDATools/src/utilities.jl @@ -142,5 +142,27 @@ function versioninfo(io::IO=stdout) query_cuda() end println(io, " $(i-1): $str (sm_$(cap.major)$(cap.minor), $(Base.format_bytes(mem.free)) / $(Base.format_bytes(mem.total)) available)") + + # report the default compilation target we'd select for this device + config = try + CUDACore.compiler_config(dev) + catch + nothing + end + if config !== nothing + ptxas_sm = config.params.sm + ptxas_ptx = config.params.ptx + llvm_sm = CUDACore.SMVersion(config.target.cap.major, + config.target.cap.minor, + config.target.feature_set) + llvm_ptx = config.target.ptx + ptxas_str = "$(CUDACore.cpu_name(ptxas_sm)) / PTX $(ptxas_ptx.major).$(ptxas_ptx.minor)" + if llvm_sm == ptxas_sm && llvm_ptx == ptxas_ptx + println(io, " compiles to $ptxas_str") + else + llvm_str = "$(CUDACore.cpu_name(llvm_sm)) / PTX $(llvm_ptx.major).$(llvm_ptx.minor)" + println(io, " compiles to $ptxas_str (LLVM: $llvm_str)") + end + end end end From 2158f425620a92922d1cc7ee9584175d4da86f61 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 11:45:38 +0200 Subject: [PATCH 09/16] Bump GPUCompiler. --- CUDACore/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index bab293aba3..2c9992781e 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.10" +GPUCompiler = "1.12" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" From fba88cd0a823f2dda98b570e426ecafc5a20b815 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 12:58:27 +0200 Subject: [PATCH 10/16] Improve test. --- CUDACore/src/compiler/sm.jl | 22 +++++++++++----------- test/core/codegen.jl | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index a96849d72f..c29f2783a3 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -28,6 +28,16 @@ struct SMVersion end end +function Base.parse(::Type{SMVersion}, s::AbstractString) + m = match(r"^(\d+)\.(\d+)([af]?)$", s) + m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"") + major = parse(Int, m.captures[1]) + minor = parse(Int, m.captures[2]) + fs = m.captures[3] == "a" ? :arch : + m.captures[3] == "f" ? :family : :baseline + return SMVersion(major, minor, fs) +end + # Suffix on the LLVM CPU name / `.target` directive suffix(sm::SMVersion) = sm.feature_set === :arch ? "a" : sm.feature_set === :family ? "f" : "" @@ -58,16 +68,6 @@ end Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"") -function _parse_sm(s::AbstractString) - m = match(r"^(\d+)\.(\d+)([af]?)$", s) - m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"") - major = parse(Int, m.captures[1]) - minor = parse(Int, m.captures[2]) - fs = m.captures[3] == "a" ? :arch : - m.captures[3] == "f" ? :family : :baseline - return SMVersion(major, minor, fs) -end - macro sm_str(s) - return _parse_sm(s) + return :(Base.parse($SMVersion, $(esc(s)))) end diff --git a/test/core/codegen.jl b/test/core/codegen.jl index ffac61be81..d818741d2c 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -288,9 +288,9 @@ end # constructor rejects bogus feature_set @test_throws ErrorException SMVersion(9, 0, :bogus) # macro rejects malformed strings - @test_throws ErrorException CUDACore._parse_sm("103a") # missing dot - @test_throws ErrorException CUDACore._parse_sm("10.3x") # unknown suffix - @test_throws ErrorException CUDACore._parse_sm("10") # missing minor + @test_throws ErrorException parse(SMVersion, "103a") # missing dot + @test_throws ErrorException parse(SMVersion, "10.3x") # unknown suffix + @test_throws ErrorException parse(SMVersion, "10") # missing minor end @testset "CUDACompilerParams hash discriminates on feature_set" begin From 8250be81d7c0f02f1f762744cdfcd1c01548c689 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 14:53:08 +0200 Subject: [PATCH 11/16] Add docs. --- CUDACore/src/compiler/sm.jl | 72 ++++++++++++++++++++++++++++--------- docs/src/api/compiler.md | 8 +++++ 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index c29f2783a3..eeebe68f73 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -1,21 +1,45 @@ -# SMVersion: a PTX compilation target, identifying a CUDA compute capability together -# with its subtarget feature set. -# -# Constructed via the `sm"..."` string macro, which accepts the dotted form used in -# CUDA's documentation: `sm"10.3a"` is compute capability 10.3 with architecture- -# accelerated features. The bare form `sm"10.3"` is the default, forward-compatible -# feature set (the "onion model"). -# -# See `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature -# definitions, and CUDA's PTX ISA documentation under `.target` for the runtime -# compatibility implications: -# -# :baseline (no suffix) - forward-compatible (sm_X for any sm_Y >= X) -# :family ('f' suffix) - same-major-family-portable -# :arch ('a' suffix) - locked to one exact CC - export SMVersion, @sm_str +""" + SMVersion(major, minor, [feature_set]) + +A PTX compilation target, identifying a CUDA compute capability together with the +subtarget feature set selected by the suffix on its `.target` directive. + +`feature_set` is one of: + +- `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"): + PTX compiled for `sm_X` runs on any `sm_Y` with `Y >= X`. +- `:family` (`f` suffix, e.g. `sm_100f`) — same-major-family-portable: PTX runs on + any device in the same architecture family (currently == same major version) at + or above this CC. +- `:arch` (`a` suffix, e.g. `sm_90a`) — locked to one exact CC: PTX runs only on + devices with exactly this compute capability, but in exchange gets access to + architecture-accelerated features. + +See NVIDIA's PTX ISA reference under `.target` for the full compatibility rules, +and `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature +definitions. + +Public fields: +- `sm.major::Int` +- `sm.minor::Int` +- `sm.feature_set::Symbol` + +See also [`@sm_str`](@ref) for an ergonomic string-macro constructor. + +# Examples +```julia +julia> SMVersion(9, 0) # baseline +sm"9.0" + +julia> SMVersion(9, 0, :arch) +sm"9.0a" + +julia> sm"10.0f" == SMVersion(10, 0, :family) +true +``` +""" struct SMVersion major::Int minor::Int @@ -68,6 +92,22 @@ end Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"") +""" + @sm_str + +String macro used to parse a string to an [`SMVersion`](@ref). Accepts the dotted form +used in NVIDIA's PTX ISA reference: `sm"9.0"` for baseline, `sm"9.0a"` for +architecture-accelerated, `sm"10.0f"` for family-specific. + +# Examples +```julia +julia> sm"10.3a" +sm"10.3a" + +julia> sm"10.0f" == SMVersion(10, 0, :family) +true +``` +""" macro sm_str(s) return :(Base.parse($SMVersion, $(esc(s)))) end diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md index a7ce178a3e..a2c493d789 100644 --- a/docs/src/api/compiler.md +++ b/docs/src/api/compiler.md @@ -25,6 +25,14 @@ registers memory ``` +The PTX compilation target is identified by an `SMVersion`, constructed via the +`sm"..."` string macro: + +```@docs +SMVersion +@sm_str +``` + To plug in alternative compiler back-ends (e.g. cuTile.jl), `@cuda` dispatches through a small protocol: From aacd3b432ef256765af599270b514c620403935e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 14:59:06 +0200 Subject: [PATCH 12/16] Switch to a more NVIDIA-like SM string. --- CUDACore/src/compatibility.jl | 150 ++++++++++++++--------------- CUDACore/src/compiler/execution.jl | 4 +- CUDACore/src/compiler/sm.jl | 34 ++++--- test/core/codegen.jl | 60 ++++++------ test/core/execution.jl | 6 +- 5 files changed, 131 insertions(+), 123 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 71a0dfc0f0..f8190d39cf 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -151,45 +151,45 @@ end # Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants # were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8. const ptx_cap_db = Dict{SMVersion, VersionRange}( - sm"1.0" => between(v"1.0", highest), - sm"1.1" => between(v"1.0", highest), - sm"1.2" => between(v"1.2", highest), - sm"1.3" => between(v"1.2", highest), - sm"2.0" => between(v"2.0", highest), - sm"3.0" => between(v"3.1", highest), - sm"3.2" => between(v"4.0", highest), - sm"3.5" => between(v"3.1", highest), - sm"3.7" => between(v"4.1", highest), - sm"5.0" => between(v"4.0", highest), - sm"5.2" => between(v"4.1", highest), - sm"5.3" => between(v"4.2", highest), - sm"6.0" => between(v"5.0", highest), - sm"6.1" => between(v"5.0", highest), - sm"6.2" => between(v"5.0", highest), - sm"7.0" => between(v"6.0", highest), - sm"7.2" => between(v"6.1", highest), - sm"7.5" => between(v"6.3", highest), - sm"8.0" => between(v"7.0", highest), - sm"8.6" => between(v"7.1", highest), - sm"8.7" => between(v"7.4", highest), - sm"8.9" => between(v"7.8", highest), - sm"9.0" => between(v"7.8", highest), - sm"9.0a" => between(v"8.0", highest), - sm"10.0" => between(v"8.6", highest), - sm"10.0a" => between(v"8.6", highest), - sm"10.0f" => between(v"8.8", highest), - sm"10.1" => between(v"8.6", highest), - sm"10.1a" => between(v"8.6", highest), - sm"10.1f" => between(v"8.8", highest), - sm"10.3" => between(v"8.8", highest), - sm"10.3a" => between(v"8.8", highest), - sm"10.3f" => between(v"8.8", highest), - sm"12.0" => between(v"8.7", highest), - sm"12.0a" => between(v"8.7", highest), - sm"12.0f" => between(v"8.8", highest), - sm"12.1" => between(v"8.8", highest), - sm"12.1a" => between(v"8.8", highest), - sm"12.1f" => between(v"8.8", highest), + sm"10" => between(v"1.0", highest), + sm"11" => between(v"1.0", highest), + sm"12" => between(v"1.2", highest), + sm"13" => between(v"1.2", highest), + sm"20" => between(v"2.0", highest), + sm"30" => between(v"3.1", highest), + sm"32" => between(v"4.0", highest), + sm"35" => between(v"3.1", highest), + sm"37" => between(v"4.1", highest), + sm"50" => between(v"4.0", highest), + sm"52" => between(v"4.1", highest), + sm"53" => between(v"4.2", highest), + sm"60" => between(v"5.0", highest), + sm"61" => between(v"5.0", highest), + sm"62" => between(v"5.0", highest), + sm"70" => between(v"6.0", highest), + sm"72" => between(v"6.1", highest), + sm"75" => between(v"6.3", highest), + sm"80" => between(v"7.0", highest), + sm"86" => between(v"7.1", highest), + sm"87" => between(v"7.4", highest), + sm"89" => between(v"7.8", highest), + sm"90" => between(v"7.8", highest), + sm"90a" => between(v"8.0", highest), + sm"100" => between(v"8.6", highest), + sm"100a" => between(v"8.6", highest), + sm"100f" => between(v"8.8", highest), + sm"101" => between(v"8.6", highest), + sm"101a" => between(v"8.6", highest), + sm"101f" => between(v"8.8", highest), + sm"103" => between(v"8.8", highest), + sm"103a" => between(v"8.8", highest), + sm"103f" => between(v"8.8", highest), + sm"120" => between(v"8.7", highest), + sm"120a" => between(v"8.7", highest), + sm"120f" => between(v"8.8", highest), + sm"121" => between(v"8.8", highest), + sm"121a" => between(v"8.8", highest), + sm"121f" => between(v"8.8", highest), ) # Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`. @@ -210,42 +210,42 @@ end # here as a separate entry; without an entry LLVM does not know the variant CPU name and # constructing a TargetMachine with it would fall back to a generic subtarget. const llvm_cap_db = Dict{SMVersion, VersionRange}( - sm"2.0" => between(v"3.2", highest), - sm"2.1" => between(v"3.2", highest), - sm"3.0" => between(v"3.2", highest), - sm"3.2" => between(v"3.7", highest), - sm"3.5" => between(v"3.2", highest), - sm"3.7" => between(v"3.7", highest), - sm"5.0" => between(v"3.5", highest), - sm"5.2" => between(v"3.7", highest), - sm"5.3" => between(v"3.7", highest), - sm"6.0" => between(v"3.9", highest), - sm"6.1" => between(v"3.9", highest), - sm"6.2" => between(v"3.9", highest), - sm"7.0" => between(v"6", highest), - sm"7.2" => between(v"7", highest), - sm"7.5" => between(v"8", highest), - sm"8.0" => between(v"11", highest), - sm"8.6" => between(v"13", highest), - sm"8.7" => between(v"16", highest), - sm"8.9" => between(v"16", highest), - sm"9.0" => between(v"16", highest), - sm"9.0a" => between(v"18", highest), - sm"10.0" => between(v"20", highest), - sm"10.0a" => between(v"20", highest), - sm"10.0f" => between(v"21", highest), - sm"10.1" => between(v"20", highest), - sm"10.1a" => between(v"20", highest), - sm"10.1f" => between(v"21", highest), - sm"10.3" => between(v"21", highest), - sm"10.3a" => between(v"21", highest), - sm"10.3f" => between(v"21", highest), - sm"12.0" => between(v"20", highest), - sm"12.0a" => between(v"20", highest), - sm"12.0f" => between(v"21", highest), - sm"12.1" => between(v"21", highest), - sm"12.1a" => between(v"21", highest), - sm"12.1f" => between(v"21", highest), + sm"20" => between(v"3.2", highest), + sm"21" => between(v"3.2", highest), + sm"30" => between(v"3.2", highest), + sm"32" => between(v"3.7", highest), + sm"35" => between(v"3.2", highest), + sm"37" => between(v"3.7", highest), + sm"50" => between(v"3.5", highest), + sm"52" => between(v"3.7", highest), + sm"53" => between(v"3.7", highest), + sm"60" => between(v"3.9", highest), + sm"61" => between(v"3.9", highest), + sm"62" => between(v"3.9", highest), + sm"70" => between(v"6", highest), + sm"72" => between(v"7", highest), + sm"75" => between(v"8", highest), + sm"80" => between(v"11", highest), + sm"86" => between(v"13", highest), + sm"87" => between(v"16", highest), + sm"89" => between(v"16", highest), + sm"90" => between(v"16", highest), + sm"90a" => between(v"18", highest), + sm"100" => between(v"20", highest), + sm"100a" => between(v"20", highest), + sm"100f" => between(v"21", highest), + sm"101" => between(v"20", highest), + sm"101a" => between(v"20", highest), + sm"101f" => between(v"21", highest), + sm"103" => between(v"21", highest), + sm"103a" => between(v"21", highest), + sm"103f" => between(v"21", highest), + sm"120" => between(v"20", highest), + sm"120a" => between(v"20", highest), + sm"120f" => between(v"21", highest), + sm"121" => between(v"21", highest), + sm"121a" => between(v"21", highest), + sm"121f" => between(v"21", highest), ) # Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`. diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index a67a3f61ed..f42de56402 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -435,8 +435,8 @@ The following keyword arguments are supported: - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for. `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g. - `cap=sm"10.3a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"10.0f"` - for family-portable Blackwell codegen. The bare form `cap=sm"10.3"` selects baseline + `cap=sm"103a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"100f"` + for family-portable Blackwell codegen. The bare form `cap=sm"103"` selects baseline (forward-compatible) codegen. Passing a `VersionNumber` is deprecated. The output of this function is automatically cached, i.e. you can simply call `cufunction` diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index eeebe68f73..1b3cdb9734 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -4,7 +4,11 @@ export SMVersion, @sm_str SMVersion(major, minor, [feature_set]) A PTX compilation target, identifying a CUDA compute capability together with the -subtarget feature set selected by the suffix on its `.target` directive. +subtarget feature set selected by the suffix on its `.target` directive. Printed and +parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"` +for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]` +notation in the PTX ISA reference and to distinguish visually from a device-level +[`VersionNumber`](@ref) like `v"9.0"`. `feature_set` is one of: @@ -31,12 +35,12 @@ See also [`@sm_str`](@ref) for an ergonomic string-macro constructor. # Examples ```julia julia> SMVersion(9, 0) # baseline -sm"9.0" +sm"90" julia> SMVersion(9, 0, :arch) -sm"9.0a" +sm"90a" -julia> sm"10.0f" == SMVersion(10, 0, :family) +julia> sm"100f" == SMVersion(10, 0, :family) true ``` """ @@ -53,8 +57,11 @@ struct SMVersion end function Base.parse(::Type{SMVersion}, s::AbstractString) - m = match(r"^(\d+)\.(\d+)([af]?)$", s) - m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"10.3\", \"10.3a\", or \"10.0f\"") + # Mirrors NVIDIA's `sm_NN[a|f]` notation: the last digit before the optional suffix + # is the minor, everything before it is the major. Always one minor digit (NVIDIA + # has never minted a CC with minor >= 10, and rolls the major over instead). + m = match(r"^(\d+)(\d)([af]?)$", s) + m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"103a\", or \"100f\"") major = parse(Int, m.captures[1]) minor = parse(Int, m.captures[2]) fs = m.captures[3] == "a" ? :arch : @@ -90,21 +97,22 @@ function runs_on(sm::SMVersion, dev_cap::VersionNumber) end -Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, ".", sm.minor, suffix(sm), "\"") +Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix(sm), "\"") """ @sm_str -String macro used to parse a string to an [`SMVersion`](@ref). Accepts the dotted form -used in NVIDIA's PTX ISA reference: `sm"9.0"` for baseline, `sm"9.0a"` for -architecture-accelerated, `sm"10.0f"` for family-specific. +String macro used to parse a string to an [`SMVersion`](@ref). Accepts NVIDIA's +compact `sm_NN[a|f]` notation (without the `sm_` prefix): `sm"90"` for baseline, +`sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. The last +digit before the optional suffix is the minor; everything before it is the major. # Examples ```julia -julia> sm"10.3a" -sm"10.3a" +julia> sm"103a" +sm"103a" -julia> sm"10.0f" == SMVersion(10, 0, :family) +julia> sm"100f" == SMVersion(10, 0, :family) true ``` """ diff --git a/test/core/codegen.jl b/test/core/codegen.jl index d818741d2c..de07b39ce9 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -255,36 +255,36 @@ end @test !success(run_ptxas(asm_pre, "sm_75")) - asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0") + asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90") @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"9.0a") + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90a") @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line. - asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"10.0f") + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"100f") @test occursin(".target sm_100f", asm_family) @test success(run_ptxas(asm_family, "sm_100f")) end @testset "SMVersion and sm\"...\" macro" begin - @test sm"9.0" == SMVersion(9, 0, :baseline) - @test sm"9.0a" == SMVersion(9, 0, :arch) - @test sm"10.0f" == SMVersion(10, 0, :family) + @test sm"90" == SMVersion(9, 0, :baseline) + @test sm"90a" == SMVersion(9, 0, :arch) + @test sm"100f" == SMVersion(10, 0, :family) # printing roundtrips via the macro form - @test sprint(show, sm"10.3a") == "sm\"10.3a\"" - @test sprint(show, sm"10.0") == "sm\"10.0\"" + @test sprint(show, sm"103a") == "sm\"103a\"" + @test sprint(show, sm"100") == "sm\"100\"" # cpu_name reflects feature_set - @test CUDACore.cpu_name(sm"9.0") == "sm_90" - @test CUDACore.cpu_name(sm"9.0a") == "sm_90a" - @test CUDACore.cpu_name(sm"10.0f") == "sm_100f" + @test CUDACore.cpu_name(sm"90") == "sm_90" + @test CUDACore.cpu_name(sm"90a") == "sm_90a" + @test CUDACore.cpu_name(sm"100f") == "sm_100f" # base_version drops the suffix back to a comparable VersionNumber - @test CUDACore.base_version(sm"10.3a") == v"10.3" + @test CUDACore.base_version(sm"103a") == v"10.3" # constructor rejects bogus feature_set @test_throws ErrorException SMVersion(9, 0, :bogus) # macro rejects malformed strings @@ -296,33 +296,33 @@ end @testset "CUDACompilerParams hash discriminates on feature_set" begin # Without feature_set in the hash, two params differing only on feature_set would collide # in the compiler cache and silently return a cubin compiled for the wrong feature set. - base = CUDACore.CUDACompilerParams(sm=sm"9.0", ptx=v"8.0") - arch = CUDACore.CUDACompilerParams(sm=sm"9.0a", ptx=v"8.0") + base = CUDACore.CUDACompilerParams(sm=sm"90", ptx=v"8.0") + arch = CUDACore.CUDACompilerParams(sm=sm"90a", ptx=v"8.0") @test hash(base) != hash(arch) @test base != arch end @testset "ptx_cap_support" begin - # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"9.0") and PTX >= 8.0. - # Family-specific needs CC >= 10.0 (no `*f` keys below sm"10.0") and PTX >= 8.8. - @test !(sm"8.6a" in CUDACore.ptx_cap_support(v"8.0")) # no `*a` below CC 9.0 - @test !(sm"9.0a" in CUDACore.ptx_cap_support(v"7.8")) # `*a` requires PTX >= 8.0 - @test !(sm"9.0f" in CUDACore.ptx_cap_support(v"8.0")) # no `*f` below CC 10.0 - @test !(sm"10.0f" in CUDACore.ptx_cap_support(v"8.7")) # `*f` requires PTX >= 8.8 - @test sm"9.0a" in CUDACore.ptx_cap_support(v"8.0") - @test sm"10.0f" in CUDACore.ptx_cap_support(v"8.8") - @test sm"5.0" in CUDACore.ptx_cap_support(v"6.2") + # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0. + # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8. + @test !(sm"86a" in CUDACore.ptx_cap_support(v"8.0")) # no `*a` below CC 9.0 + @test !(sm"90a" in CUDACore.ptx_cap_support(v"7.8")) # `*a` requires PTX >= 8.0 + @test !(sm"90f" in CUDACore.ptx_cap_support(v"8.0")) # no `*f` below CC 10.0 + @test !(sm"100f" in CUDACore.ptx_cap_support(v"8.7")) # `*f` requires PTX >= 8.8 + @test sm"90a" in CUDACore.ptx_cap_support(v"8.0") + @test sm"100f" in CUDACore.ptx_cap_support(v"8.8") + @test sm"50" in CUDACore.ptx_cap_support(v"6.2") end @testset "llvm_cap_support" begin # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td. - @test sm"10.3a" in CUDACore.llvm_cap_support(v"21") - @test sm"10.0f" in CUDACore.llvm_cap_support(v"21") - @test sm"9.0a" in CUDACore.llvm_cap_support(v"18") - @test !(sm"9.0a" in CUDACore.llvm_cap_support(v"17")) # sm_90a added in LLVM 18 - @test !(sm"10.3a" in CUDACore.llvm_cap_support(v"20")) # sm_103a added in LLVM 21 - @test !(sm"10.0f" in CUDACore.llvm_cap_support(v"20")) # sm_100f added in LLVM 21 - @test sm"7.0" in CUDACore.llvm_cap_support(v"15") + @test sm"103a" in CUDACore.llvm_cap_support(v"21") + @test sm"100f" in CUDACore.llvm_cap_support(v"21") + @test sm"90a" in CUDACore.llvm_cap_support(v"18") + @test !(sm"90a" in CUDACore.llvm_cap_support(v"17")) # sm_90a added in LLVM 18 + @test !(sm"103a" in CUDACore.llvm_cap_support(v"20")) # sm_103a added in LLVM 21 + @test !(sm"100f" in CUDACore.llvm_cap_support(v"20")) # sm_100f added in LLVM 21 + @test sm"70" in CUDACore.llvm_cap_support(v"15") end end diff --git a/test/core/execution.jl b/test/core/execution.jl index ddb871b936..a3e5530aa9 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -50,15 +50,15 @@ end @cuda threads=2 dummy() # sm_10 isn't supported by LLVM - @test_throws "not supported by LLVM" @cuda launch=false cap=sm"1.0" dummy() + @test_throws "not supported by LLVM" @cuda launch=false cap=sm"10" dummy() # sm_20 is, but not by any CUDA version we support - @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"2.0" dummy() + @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"20" dummy() # there isn't any capability other than the device's that's guaruanteed to work dev_cap = capability(device()) dev_sm = SMVersion(dev_cap.major, dev_cap.minor) @cuda launch=false cap=dev_sm dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"5.0")) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50")) @test contains(asm, ".target sm_50") # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the From 17bd4c8b2a72ac4ecab94be0ae2f62503867026d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 16:04:00 +0200 Subject: [PATCH 13/16] More cleanly separate device capability and codegen architecture. --- CUDACore/src/compatibility.jl | 19 ++++++++------ CUDACore/src/compiler/compilation.jl | 34 ++++++++++++++---------- CUDACore/src/compiler/execution.jl | 13 +++++----- CUDACore/src/device/runtime.jl | 2 +- CUDACore/src/precompile.jl | 6 ++--- test/core/codegen.jl | 39 ++++++++++++++-------------- test/core/execution.jl | 32 +++++++++++++---------- 7 files changed, 81 insertions(+), 64 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index f8190d39cf..0f512a0255 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -16,7 +16,7 @@ const highest = v"999" # exact CC; code compiled for sm_103a runs only on CC 10.3 devices. # # Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX -# require for them, is encoded directly in the keys of `ptx_cap_db` and `llvm_cap_db` +# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db` # below: an unsupported combination simply has no entry. @@ -150,7 +150,7 @@ end # Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants # were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8. -const ptx_cap_db = Dict{SMVersion, VersionRange}( +const ptx_sm_db = Dict{SMVersion, VersionRange}( sm"10" => between(v"1.0", highest), sm"11" => between(v"1.0", highest), sm"12" => between(v"1.2", highest), @@ -193,9 +193,9 @@ const ptx_cap_db = Dict{SMVersion, VersionRange}( ) # Set of `SMVersion`s (across all feature sets) whose ptxas floor is met by `ver`. -function ptx_cap_support(ver::VersionNumber) +function ptx_sm_support(ver::VersionNumber) caps = Set{SMVersion}() - for (cap, r) in ptx_cap_db + for (cap, r) in ptx_sm_db if ver in r push!(caps, cap) end @@ -209,7 +209,7 @@ end # Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up # here as a separate entry; without an entry LLVM does not know the variant CPU name and # constructing a TargetMachine with it would fall back to a generic subtarget. -const llvm_cap_db = Dict{SMVersion, VersionRange}( +const llvm_sm_db = Dict{SMVersion, VersionRange}( sm"20" => between(v"3.2", highest), sm"21" => between(v"3.2", highest), sm"30" => between(v"3.2", highest), @@ -249,9 +249,9 @@ const llvm_cap_db = Dict{SMVersion, VersionRange}( ) # Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`. -function llvm_cap_support(ver::VersionNumber) +function llvm_sm_support(ver::VersionNumber) caps = Set{SMVersion}() - for (cap, r) in llvm_cap_db + for (cap, r) in llvm_sm_db if ver in r push!(caps, cap) end @@ -315,7 +315,10 @@ end function llvm_compat(version=LLVM.version()) LLVM.InitializeNVPTXTarget() - return (cap=llvm_cap_support(version), + # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`. + # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level + # support is per-CC -- the names track the value type. + return (sm=llvm_sm_support(version), ptx=llvm_ptx_support(version)) end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 32ff9f3375..dc296a1ca1 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -186,13 +186,21 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) - # convert / deprecate VersionNumber cap - if cap isa VersionNumber - Base.depwarn("Passing a VersionNumber to `cap=` is deprecated; use the `sm\"$(cap.major).$(cap.minor)\"` string macro instead.", + arch=nothing, cap=nothing, ptx=nothing, kwargs...) + # `cap=` is the deprecated old name for `arch=` (matches nvcc/ptxas `-arch`). + if cap !== nothing + arch === nothing || + throw(ArgumentError("pass either `arch=` or the deprecated `cap=`, not both")) + Base.depwarn("the `cap=` kwarg is deprecated; use `arch=` (matching nvcc/ptxas `-arch`) instead.", :cufunction) - cap = SMVersion(cap.major, cap.minor, :baseline) + arch = cap + end + # `arch=` accepts a plain `VersionNumber` (treated as baseline) or an `SMVersion`. + if arch isa VersionNumber + arch = SMVersion(arch.major, arch.minor, :baseline) end + arch === nothing || arch isa SMVersion || + throw(ArgumentError("`arch=` must be a VersionNumber, SMVersion, or nothing; got $(typeof(arch))")) # inspect the toolchain llvm_support = llvm_compat() @@ -226,29 +234,29 @@ end # determine the compute capability to use. ## ptxas - ptx_caps = ptx_cap_support(ptxas_ptx) - if cap !== nothing + ptx_sms = ptx_sm_support(ptxas_ptx) + if arch !== nothing # explicit request: take it as-is, validating against the PTX ISA - cap in ptx_caps || - error("$(cpu_name(cap)) is not supported by PTX ISA $(ptxas_ptx)") - ptxas_sm = cap + arch in ptx_sms || + error("$(cpu_name(arch)) is not supported by PTX ISA $(ptxas_ptx)") + ptxas_sm = arch else # pick the most specific capability the selected PTX ISA supports whose cubin # would actually load on the current device. For baseline that's the onion model; # `:arch` requires an exact CC match, `:family` a same-family match. - ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_caps) + ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_sms) isempty(ptxas_candidates) && error("Compute capability $(capability(dev)) is not supported by ptxas " * "$(compiler_version()) at PTX ISA $(ptxas_ptx)") ptxas_sm = argmax(sm_key, ptxas_candidates) end ## LLVM - if ptxas_sm in llvm_support.cap + if ptxas_sm in llvm_support.sm llvm_sm = ptxas_sm else # Exact `ptxas_sm` unavailable in LLVM. Fall back to baseline LLVM at a # lower base, since arch/family features don't carry across versions. - baseline_candidates = filter(llvm_support.cap) do sm + baseline_candidates = filter(llvm_support.sm) do sm sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm) end isempty(baseline_candidates) && diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index f42de56402..bfb7d963ce 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} = ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :arch, :cap, :ptx] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream] @@ -433,11 +433,12 @@ The following keyword arguments are supported: - `name`: override the name that the kernel will have in the generated code - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals -- `cap` and `ptx`: to override the compute capability and PTX version to compile for. - `cap` accepts an [`SMVersion`](@ref) via the `sm"..."` string macro, e.g. - `cap=sm"103a"` for architecture-accelerated codegen on CC 10.3, or `cap=sm"100f"` - for family-portable Blackwell codegen. The bare form `cap=sm"103"` selects baseline - (forward-compatible) codegen. Passing a `VersionNumber` is deprecated. +- `arch` and `ptx`: override the GPU architecture (matching nvcc/ptxas `-arch`) and the + PTX ISA version to compile for. `arch` accepts either an [`SMVersion`](@ref) via the + `sm"..."` string macro (e.g. `arch=sm"103a"` for architecture-accelerated codegen on + CC 10.3, or `arch=sm"100f"` for family-portable Blackwell codegen) or a `VersionNumber` + (e.g. `arch=v"10.3"`, treated as baseline / forward-compatible). The old kwarg name + `cap=` is accepted as a deprecated alias. The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl index 93e0cd90c5..6739d27d68 100644 --- a/CUDACore/src/device/runtime.jl +++ b/CUDACore/src/device/runtime.jl @@ -14,7 +14,7 @@ function precompile_runtime() # `.cap` is now keyed by `SMVersion` and includes variants; runtime caches are # feature_set-agnostic, so we only warm the baseline entries. - sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().cap) + sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().sm) ptx = maximum(llvm_compat().ptx) JuliaContext() do ctx for sm in sms, debuginfo in [false, true] diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl index a5c39b1cd8..8817aa74cc 100644 --- a/CUDACore/src/precompile.jl +++ b/CUDACore/src/precompile.jl @@ -13,12 +13,12 @@ if :NVPTX in LLVM.backends() end llvm_support = llvm_compat() - # `.cap` is keyed by `SMVersion` and includes variants; pick the highest - # baseline cap <= v"7.5" for a portable precompile artifact. + # `.sm` is `Set{SMVersion}` (with variants); pick the highest baseline + # entry <= v"7.5" for a portable precompile artifact. llvm_sm = argmax(base_version, filter(sm -> sm.feature_set === :baseline && base_version(sm) <= v"7.5", - llvm_support.cap)) + llvm_support.sm)) llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx)) target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index de07b39ce9..30e472cd08 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -288,9 +288,10 @@ end # constructor rejects bogus feature_set @test_throws ErrorException SMVersion(9, 0, :bogus) # macro rejects malformed strings - @test_throws ErrorException parse(SMVersion, "103a") # missing dot - @test_throws ErrorException parse(SMVersion, "10.3x") # unknown suffix - @test_throws ErrorException parse(SMVersion, "10") # missing minor + @test_throws ErrorException parse(SMVersion, "10.3a") # dotted form (NVIDIA uses dotless) + @test_throws ErrorException parse(SMVersion, "100x") # unknown suffix + @test_throws ErrorException parse(SMVersion, "1") # only one digit (need at least major + minor) + @test_throws ErrorException parse(SMVersion, "") # empty end @testset "CUDACompilerParams hash discriminates on feature_set" begin @@ -302,27 +303,27 @@ end @test base != arch end -@testset "ptx_cap_support" begin +@testset "ptx_sm_support" begin # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0. # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8. - @test !(sm"86a" in CUDACore.ptx_cap_support(v"8.0")) # no `*a` below CC 9.0 - @test !(sm"90a" in CUDACore.ptx_cap_support(v"7.8")) # `*a` requires PTX >= 8.0 - @test !(sm"90f" in CUDACore.ptx_cap_support(v"8.0")) # no `*f` below CC 10.0 - @test !(sm"100f" in CUDACore.ptx_cap_support(v"8.7")) # `*f` requires PTX >= 8.8 - @test sm"90a" in CUDACore.ptx_cap_support(v"8.0") - @test sm"100f" in CUDACore.ptx_cap_support(v"8.8") - @test sm"50" in CUDACore.ptx_cap_support(v"6.2") + @test !(sm"86a" in CUDACore.ptx_sm_support(v"8.0")) # no `*a` below CC 9.0 + @test !(sm"90a" in CUDACore.ptx_sm_support(v"7.8")) # `*a` requires PTX >= 8.0 + @test !(sm"90f" in CUDACore.ptx_sm_support(v"8.0")) # no `*f` below CC 10.0 + @test !(sm"100f" in CUDACore.ptx_sm_support(v"8.7")) # `*f` requires PTX >= 8.8 + @test sm"90a" in CUDACore.ptx_sm_support(v"8.0") + @test sm"100f" in CUDACore.ptx_sm_support(v"8.8") + @test sm"50" in CUDACore.ptx_sm_support(v"6.2") end -@testset "llvm_cap_support" begin +@testset "llvm_sm_support" begin # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td. - @test sm"103a" in CUDACore.llvm_cap_support(v"21") - @test sm"100f" in CUDACore.llvm_cap_support(v"21") - @test sm"90a" in CUDACore.llvm_cap_support(v"18") - @test !(sm"90a" in CUDACore.llvm_cap_support(v"17")) # sm_90a added in LLVM 18 - @test !(sm"103a" in CUDACore.llvm_cap_support(v"20")) # sm_103a added in LLVM 21 - @test !(sm"100f" in CUDACore.llvm_cap_support(v"20")) # sm_100f added in LLVM 21 - @test sm"70" in CUDACore.llvm_cap_support(v"15") + @test sm"103a" in CUDACore.llvm_sm_support(v"21") + @test sm"100f" in CUDACore.llvm_sm_support(v"21") + @test sm"90a" in CUDACore.llvm_sm_support(v"18") + @test !(sm"90a" in CUDACore.llvm_sm_support(v"17")) # sm_90a added in LLVM 18 + @test !(sm"103a" in CUDACore.llvm_sm_support(v"20")) # sm_103a added in LLVM 21 + @test !(sm"100f" in CUDACore.llvm_sm_support(v"20")) # sm_100f added in LLVM 21 + @test sm"70" in CUDACore.llvm_sm_support(v"15") end end diff --git a/test/core/execution.jl b/test/core/execution.jl index a3e5530aa9..387af3547e 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -50,15 +50,20 @@ end @cuda threads=2 dummy() # sm_10 isn't supported by LLVM - @test_throws "not supported by LLVM" @cuda launch=false cap=sm"10" dummy() + @test_throws "not supported by LLVM" @cuda launch=false arch=sm"10" dummy() # sm_20 is, but not by any CUDA version we support - @test_throws "Failed to compile PTX code" @cuda launch=false cap=sm"20" dummy() + @test_throws "Failed to compile PTX code" @cuda launch=false arch=sm"20" dummy() # there isn't any capability other than the device's that's guaruanteed to work dev_cap = capability(device()) dev_sm = SMVersion(dev_cap.major, dev_cap.minor) - @cuda launch=false cap=dev_sm dummy() + @cuda launch=false arch=dev_sm dummy() + # `arch=` also accepts a plain `VersionNumber` -- treated as baseline. Equivalent + # to constructing the SMVersion directly. + @cuda launch=false arch=dev_cap dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50")) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50")) + @test contains(asm, ".target sm_50") + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0")) @test contains(asm, ".target sm_50") # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the @@ -79,23 +84,22 @@ end sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) if dev_cap >= v"9.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_a)) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a)) @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") # arch-specific cubin should also actually launch on the matching device - @cuda cap=sm_a dummy() + @cuda arch=sm_a dummy() end if dev_cap >= v"10.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm_f)) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f)) @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") - @cuda cap=sm_f dummy() + @cuda arch=sm_f dummy() end - # passing a VersionNumber to `cap` is deprecated; check the depwarn fires while - # the path still produces the right PTX. (Uses code_ptx to skip ptxas, which on - # newer CUDA toolkits no longer accepts sm_50.) - @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0")) + # `cap=` is the deprecated alias for `arch=`; check the depwarn fires while + # the path still produces the right PTX. + @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50")) - # With no explicit `cap=`, we default to architecture-specific code paths on CC >=9.0 + # With no explicit `arch=`, we default to architecture-specific code paths on CC >=9.0 # since we know the exact device. The cuda-side `.target` is the variant regardless of # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs. if dev_cap >= v"9.0" @@ -121,7 +125,7 @@ end @cuda threads=1 read_feature_set!(out) # arch features come through `target_feature_set()` only when LLVM natively supported # the variant; otherwise we fell back to baseline LLVM and the global reflects that. - arch_in_llvm = sm_a in CUDACore.llvm_cap_support(CUDACore.LLVM.version()) + arch_in_llvm = sm_a in CUDACore.llvm_sm_support(CUDACore.LLVM.version()) expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0) @test Array(out)[1] == expected end From aa9ae2e98987612898a3704ab10bce604ae4ef28 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 16:12:42 +0200 Subject: [PATCH 14/16] Also support string-based version numbers with sm_ prefixes. --- CUDACore/src/compiler/compilation.jl | 9 ++---- CUDACore/src/compiler/sm.jl | 41 ++++++++++++++++++++++------ test/core/codegen.jl | 8 ++++++ 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index dc296a1ca1..bee91fbca1 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -195,12 +195,9 @@ end :cufunction) arch = cap end - # `arch=` accepts a plain `VersionNumber` (treated as baseline) or an `SMVersion`. - if arch isa VersionNumber - arch = SMVersion(arch.major, arch.minor, :baseline) - end - arch === nothing || arch isa SMVersion || - throw(ArgumentError("`arch=` must be a VersionNumber, SMVersion, or nothing; got $(typeof(arch))")) + # `SMVersion` is the universal normalizer: identity for an SMVersion, baseline-promotes + # a VersionNumber, parses a string. Anything else falls out as a MethodError naturally. + arch === nothing || (arch = SMVersion(arch)) # inspect the toolchain llvm_support = llvm_compat() diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index 1b3cdb9734..420e280eea 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -2,6 +2,9 @@ export SMVersion, @sm_str """ SMVersion(major, minor, [feature_set]) + SMVersion(s::AbstractString) + SMVersion(v::VersionNumber) + SMVersion(sm::SMVersion) A PTX compilation target, identifying a CUDA compute capability together with the subtarget feature set selected by the suffix on its `.target` directive. Printed and @@ -10,6 +13,17 @@ for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]` notation in the PTX ISA reference and to distinguish visually from a device-level [`VersionNumber`](@ref) like `v"9.0"`. +The single-argument constructors normalize various inputs to an `SMVersion`: + +- `SMVersion(::AbstractString)` parses the compact form, with or without the `sm_` + prefix (so e.g. `SMVersion("sm_103a")` and `SMVersion("103a")` both work). +- `SMVersion(::VersionNumber)` promotes a plain compute-capability version to a + baseline `SMVersion` (`SMVersion(v"10.3") == SMVersion(10, 3, :baseline)`). +- `SMVersion(::SMVersion)` is the identity (idempotent). + +This is what lets `@cuda arch=...` accept `v"10.3"`, `sm"103a"`, `"sm_103a"`, or +an already-constructed `SMVersion` interchangeably. + `feature_set` is one of: - `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"): @@ -59,9 +73,12 @@ end function Base.parse(::Type{SMVersion}, s::AbstractString) # Mirrors NVIDIA's `sm_NN[a|f]` notation: the last digit before the optional suffix # is the minor, everything before it is the major. Always one minor digit (NVIDIA - # has never minted a CC with minor >= 10, and rolls the major over instead). - m = match(r"^(\d+)(\d)([af]?)$", s) - m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"103a\", or \"100f\"") + # has never minted a CC with minor >= 10, and rolls the major over instead). The + # optional `sm_` prefix is accepted so PTX-tool output / config strings can pass + # straight through. + raw = startswith(s, "sm_") ? SubString(s, 4) : s + m = match(r"^(\d+)(\d)([af]?)$", raw) + m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"sm_103a\", or \"100f\"") major = parse(Int, m.captures[1]) minor = parse(Int, m.captures[2]) fs = m.captures[3] == "a" ? :arch : @@ -69,6 +86,13 @@ function Base.parse(::Type{SMVersion}, s::AbstractString) return SMVersion(major, minor, fs) end +# Single-argument constructor: the universal normalizer for accepting an `arch`/`cap`-like +# argument. Identity for SMVersion; baseline-promotes a plain VersionNumber; parses a +# string (with or without the `sm_` prefix). +SMVersion(sm::SMVersion) = sm +SMVersion(v::VersionNumber) = SMVersion(v.major, v.minor, :baseline) +SMVersion(s::AbstractString) = Base.parse(SMVersion, s) + # Suffix on the LLVM CPU name / `.target` directive suffix(sm::SMVersion) = sm.feature_set === :arch ? "a" : sm.feature_set === :family ? "f" : "" @@ -103,9 +127,10 @@ Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix( @sm_str String macro used to parse a string to an [`SMVersion`](@ref). Accepts NVIDIA's -compact `sm_NN[a|f]` notation (without the `sm_` prefix): `sm"90"` for baseline, -`sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. The last -digit before the optional suffix is the minor; everything before it is the major. +compact `sm_NN[a|f]` notation (with or without the `sm_` prefix): `sm"90"` for +baseline, `sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. +Equivalent to calling `SMVersion(str)`; parses at macro-expansion time, so the +resulting `SMVersion` is a compile-time constant in the surrounding expression. # Examples ```julia @@ -116,6 +141,4 @@ julia> sm"100f" == SMVersion(10, 0, :family) true ``` """ -macro sm_str(s) - return :(Base.parse($SMVersion, $(esc(s)))) -end +macro sm_str(s); SMVersion(s); end diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 30e472cd08..c6acc1106f 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -292,6 +292,14 @@ end @test_throws ErrorException parse(SMVersion, "100x") # unknown suffix @test_throws ErrorException parse(SMVersion, "1") # only one digit (need at least major + minor) @test_throws ErrorException parse(SMVersion, "") # empty + + # `SMVersion(x)` as the universal normalizer: + @test SMVersion(sm"103a") === sm"103a" # identity + @test SMVersion(v"10.3") == SMVersion(10, 3, :baseline) # VersionNumber → baseline + @test SMVersion("103a") == sm"103a" # bare string + @test SMVersion("sm_103a") == sm"103a" # accepts NVIDIA prefix + # the macro is just a parse-time call to the constructor + @test sm"103a" == SMVersion("103a") end @testset "CUDACompilerParams hash discriminates on feature_set" begin From 1cedc6393db040edc3977643bc007d0a9e5cb98c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 16:19:02 +0200 Subject: [PATCH 15/16] Drop overly specific tests. --- test/core/codegen.jl | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index c6acc1106f..db431db4ed 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -302,38 +302,6 @@ end @test sm"103a" == SMVersion("103a") end -@testset "CUDACompilerParams hash discriminates on feature_set" begin - # Without feature_set in the hash, two params differing only on feature_set would collide - # in the compiler cache and silently return a cubin compiled for the wrong feature set. - base = CUDACore.CUDACompilerParams(sm=sm"90", ptx=v"8.0") - arch = CUDACore.CUDACompilerParams(sm=sm"90a", ptx=v"8.0") - @test hash(base) != hash(arch) - @test base != arch -end - -@testset "ptx_sm_support" begin - # Architecture-specific needs CC >= 9.0 (i.e. no `*a` keys below sm"90") and PTX >= 8.0. - # Family-specific needs CC >= 10.0 (no `*f` keys below sm"100") and PTX >= 8.8. - @test !(sm"86a" in CUDACore.ptx_sm_support(v"8.0")) # no `*a` below CC 9.0 - @test !(sm"90a" in CUDACore.ptx_sm_support(v"7.8")) # `*a` requires PTX >= 8.0 - @test !(sm"90f" in CUDACore.ptx_sm_support(v"8.0")) # no `*f` below CC 10.0 - @test !(sm"100f" in CUDACore.ptx_sm_support(v"8.7")) # `*f` requires PTX >= 8.8 - @test sm"90a" in CUDACore.ptx_sm_support(v"8.0") - @test sm"100f" in CUDACore.ptx_sm_support(v"8.8") - @test sm"50" in CUDACore.ptx_sm_support(v"6.2") -end - -@testset "llvm_sm_support" begin - # Floors come from `def : Proc<"sm_NNa", ...>` etc. in NVPTX.td. - @test sm"103a" in CUDACore.llvm_sm_support(v"21") - @test sm"100f" in CUDACore.llvm_sm_support(v"21") - @test sm"90a" in CUDACore.llvm_sm_support(v"18") - @test !(sm"90a" in CUDACore.llvm_sm_support(v"17")) # sm_90a added in LLVM 18 - @test !(sm"103a" in CUDACore.llvm_sm_support(v"20")) # sm_103a added in LLVM 21 - @test !(sm"100f" in CUDACore.llvm_sm_support(v"20")) # sm_100f added in LLVM 21 - @test sm"70" in CUDACore.llvm_sm_support(v"15") -end - end ############################################################################################ From cf5b4b6ea878909de617966bfd33c45d2885b51f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 22:02:46 +0200 Subject: [PATCH 16/16] Fix docs. [only docs] --- CUDACore/src/compiler/sm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl index 420e280eea..86f566766a 100644 --- a/CUDACore/src/compiler/sm.jl +++ b/CUDACore/src/compiler/sm.jl @@ -11,7 +11,7 @@ subtarget feature set selected by the suffix on its `.target` directive. Printed parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"` for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]` notation in the PTX ISA reference and to distinguish visually from a device-level -[`VersionNumber`](@ref) like `v"9.0"`. +`VersionNumber` like `v"9.0"`. The single-argument constructors normalize various inputs to an `SMVersion`: