diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index bab293aba3..2c9992781e 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.10" +GPUCompiler = "1.12" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/lib/cudadrv/state.jl b/CUDACore/lib/cudadrv/state.jl index 6c03564bb8..32159136fa 100644 --- a/CUDACore/lib/cudadrv/state.jl +++ b/CUDACore/lib/cudadrv/state.jl @@ -227,7 +227,7 @@ function context(dev::CuDevice) maxlog=1, _id=devidx) end # ... or too new - if !in(capability(dev), cuda_compat().cap) + if !in(capability(dev), ptxas_compat().cap) @warn("""Your $(name(dev)) GPU (compute capability $(capability(dev).major).$(capability(dev).minor)) is not fully supported by CUDA $(runtime_version()). Some functionality may be broken. Ensure you are using the latest version of CUDA.jl in combination with an up-to-date NVIDIA driver. If that does not help, please file an issue to add support for the latest CUDA toolkit.""", diff --git a/CUDACore/src/CUDACore.jl b/CUDACore/src/CUDACore.jl index fa2bfa2b45..c0332759ff 100644 --- a/CUDACore/src/CUDACore.jl +++ b/CUDACore/src/CUDACore.jl @@ -80,6 +80,7 @@ include("../lib/cudadrv/CUDAdrv.jl") # essential stuff include("initialization.jl") +include("compiler/sm.jl") include("compatibility.jl") include("debug.jl") diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index afb833b32a..0f512a0255 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -1,15 +1,25 @@ # compatibility of Julia, CUDA and LLVM -# NOTE: Target architectures with suffix “a”, such as sm_90a, include -# architecture-accelerated features that are supported on the specified architecture only, -# hence such targets do not follow the onion layer model. 
Therefore, PTX code generated for -# such targets cannot be run on later generation devices. Architecture-accelerated features -# can only be used with targets that support these features. - const lowest = v"0" const highest = v"999" +# PTX compilation targets come in three feature-set flavors (carried on `SMVersion`), +# selected via the suffix on the `.target` directive (and the matching `--gpu-name` +# to ptxas): +# +# - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled +# for sm_X runs on any sm_Y with Y >= X (onion model). +# - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; +# code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. +# - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one +# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. +# +# Which feature sets exist for a given CC, and which PTX ISA / LLVM versions ptxas / NVPTX +# require for them, is encoded directly in the keys of `ptx_sm_db` and `llvm_sm_db` +# below: an unsupported combination simply has no entry. + + ## version range struct VersionRange @@ -26,12 +36,12 @@ Base.intersect(v::VersionNumber, r::VersionRange) = v > r.upper ? 
(v:r.upper) : (v:v) -## devices supported by the CUDA toolkit +## devices supported by ptxas # Source: # - https://en.wikipedia.org/wiki/CUDA#GPUs_supported # - ptxas |& grep -A 10 '\--gpu-name' -const cuda_cap_db = Dict( +const ptxas_cap_db = Dict( v"1.0" => between(lowest, v"6.5"), v"1.1" => between(lowest, v"6.5"), v"1.2" => between(lowest, v"6.5"), @@ -63,9 +73,9 @@ const cuda_cap_db = Dict( v"12.1" => between(v"12.9", highest), ) -function cuda_cap_support(ver::VersionNumber) +function ptxas_cap_support(ver::VersionNumber) caps = Set{VersionNumber}() - for (cap,r) in cuda_cap_db + for (cap,r) in ptxas_cap_db if ver in r push!(caps, cap) end @@ -74,10 +84,10 @@ function cuda_cap_support(ver::VersionNumber) end -## PTX ISAs supported by the CUDA toolkit +## PTX ISAs supported by ptxas # Source: PTX ISA document, Release History table -const cuda_ptx_db = Dict( +const ptxas_ptx_db = Dict( v"1.0" => between(v"1.0", highest), v"1.1" => between(v"1.1", highest), v"1.2" => between(v"2.0", highest), @@ -125,9 +135,9 @@ const cuda_ptx_db = Dict( v"9.2" => between(v"13.2", highest), ) -function cuda_ptx_support(ver::VersionNumber) +function ptxas_ptx_support(ver::VersionNumber) caps = Set{VersionNumber}() - for (cap,r) in cuda_ptx_db + for (cap,r) in ptxas_ptx_db if ver in r push!(caps, cap) end @@ -138,52 +148,54 @@ end ## devices supported by each PTX ISA -# Source: PTX ISA document, Release History table -const ptx_cap_db = Dict( - v"1.0" => between(v"1.0", highest), - v"1.1" => between(v"1.0", highest), - v"1.2" => between(v"1.2", highest), - v"1.3" => between(v"1.2", highest), - v"2.0" => between(v"2.0", highest), - v"3.0" => between(v"3.1", highest), - v"3.2" => between(v"4.0", highest), - v"3.5" => between(v"3.1", highest), - v"3.7" => between(v"4.1", highest), - v"5.0" => between(v"4.0", highest), - v"5.2" => between(v"4.1", highest), - v"5.3" => between(v"4.2", highest), - v"6.0" => between(v"5.0", highest), - v"6.1" => between(v"5.0", highest), - v"6.2" => 
between(v"5.0", highest), - v"7.0" => between(v"6.0", highest), - v"7.2" => between(v"6.1", highest), - v"7.5" => between(v"6.3", highest), - v"8.0" => between(v"7.0", highest), - v"8.6" => between(v"7.1", highest), - v"8.7" => between(v"7.4", highest), - v"8.9" => between(v"7.8", highest), - v"9.0" => between(v"7.8", highest), - #v"9.0a" => between(v"8.0", highest) - v"10.0" => between(v"8.6", highest), - #v"10.0a"=> between(v"8.6", highest), - #v"10.0f"=> between(v"8.8", highest), - v"10.1" => between(v"8.6", highest), - #v"10.1a"=> between(v"8.6", highest), - #v"10.1f"=> between(v"8.8", highest), - v"10.3" => between(v"8.8", highest), - #v"10.3a"=> between(v"8.8", highest), - #v"10.3f"=> between(v"8.8", highest), - v"12.0" => between(v"8.7", highest), - #v"12.0a"=> between(v"8.7", highest), - #v"12.0f"=> between(v"8.8", highest), - v"12.1" => between(v"8.8", highest), - #v"12.1a"=> between(v"8.8", highest), - #v"12.1f"=> between(v"8.8", highest), +# Source: PTX ISA document, Release History table. Architecture-specific (`*a`) variants +# were introduced at CC 9.0 / PTX 8.0; family-specific (`*f`) variants at CC 10.0 / PTX 8.8. 
+const ptx_sm_db = Dict{SMVersion, VersionRange}( + sm"10" => between(v"1.0", highest), + sm"11" => between(v"1.0", highest), + sm"12" => between(v"1.2", highest), + sm"13" => between(v"1.2", highest), + sm"20" => between(v"2.0", highest), + sm"30" => between(v"3.1", highest), + sm"32" => between(v"4.0", highest), + sm"35" => between(v"3.1", highest), + sm"37" => between(v"4.1", highest), + sm"50" => between(v"4.0", highest), + sm"52" => between(v"4.1", highest), + sm"53" => between(v"4.2", highest), + sm"60" => between(v"5.0", highest), + sm"61" => between(v"5.0", highest), + sm"62" => between(v"5.0", highest), + sm"70" => between(v"6.0", highest), + sm"72" => between(v"6.1", highest), + sm"75" => between(v"6.3", highest), + sm"80" => between(v"7.0", highest), + sm"86" => between(v"7.1", highest), + sm"87" => between(v"7.4", highest), + sm"89" => between(v"7.8", highest), + sm"90" => between(v"7.8", highest), + sm"90a" => between(v"8.0", highest), + sm"100" => between(v"8.6", highest), + sm"100a" => between(v"8.6", highest), + sm"100f" => between(v"8.8", highest), + sm"101" => between(v"8.6", highest), + sm"101a" => between(v"8.6", highest), + sm"101f" => between(v"8.8", highest), + sm"103" => between(v"8.8", highest), + sm"103a" => between(v"8.8", highest), + sm"103f" => between(v"8.8", highest), + sm"120" => between(v"8.7", highest), + sm"120a" => between(v"8.7", highest), + sm"120f" => between(v"8.8", highest), + sm"121" => between(v"8.8", highest), + sm"121a" => between(v"8.8", highest), + sm"121f" => between(v"8.8", highest), ) -function ptx_cap_support(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap,r) in ptx_cap_db +# Set of `SMVersion`s (across all feature sets) whose PTX ISA floor is met by `ver`. 
+function ptx_sm_support(ver::VersionNumber) + caps = Set{SMVersion}() + for (cap, r) in ptx_sm_db if ver in r push!(caps, cap) end @@ -194,44 +206,52 @@ end ## devices supported by the LLVM NVPTX back-end -# Source: LLVM/lib/Target/NVPTX/NVPTX.td -const llvm_cap_db = Dict( - v"2.0" => between(v"3.2", highest), - v"2.1" => between(v"3.2", highest), - v"3.0" => between(v"3.2", highest), - v"3.2" => between(v"3.7", highest), - v"3.5" => between(v"3.2", highest), - v"3.7" => between(v"3.7", highest), - v"5.0" => between(v"3.5", highest), - v"5.2" => between(v"3.7", highest), - v"5.3" => between(v"3.7", highest), - v"6.0" => between(v"3.9", highest), - v"6.1" => between(v"3.9", highest), - v"6.2" => between(v"3.9", highest), - v"7.0" => between(v"6", highest), - v"7.2" => between(v"7", highest), - v"7.5" => between(v"8", highest), - v"8.0" => between(v"11", highest), - v"8.6" => between(v"13", highest), - v"8.7" => between(v"16", highest), - v"8.9" => between(v"16", highest), - v"9.0" => between(v"16", highest), - #v"9.0a" => between(v"18", highest), - v"10.0" => between(v"20", highest), - #v"10.0a"=> between(v"20", highest), - v"10.1" => between(v"20", highest), - #v"10.1a"=> between(v"20", highest), - v"10.3" => between(v"21", highest), - #v"10.3a"=> between(v"21", highest), - v"12.0" => between(v"20", highest), - #v"12.0a"=> between(v"20", highest), - v"12.1" => between(v"21", highest), - #v"12.1a"=> between(v"21", highest), +# Source: LLVM/lib/Target/NVPTX/NVPTX.td. Each `def : Proc<"sm_NN[a|f]", ...>` shows up +# here as a separate entry; without an entry LLVM does not know the variant CPU name and +# constructing a TargetMachine with it would fall back to a generic subtarget. 
+const llvm_sm_db = Dict{SMVersion, VersionRange}( + sm"20" => between(v"3.2", highest), + sm"21" => between(v"3.2", highest), + sm"30" => between(v"3.2", highest), + sm"32" => between(v"3.7", highest), + sm"35" => between(v"3.2", highest), + sm"37" => between(v"3.7", highest), + sm"50" => between(v"3.5", highest), + sm"52" => between(v"3.7", highest), + sm"53" => between(v"3.7", highest), + sm"60" => between(v"3.9", highest), + sm"61" => between(v"3.9", highest), + sm"62" => between(v"3.9", highest), + sm"70" => between(v"6", highest), + sm"72" => between(v"7", highest), + sm"75" => between(v"8", highest), + sm"80" => between(v"11", highest), + sm"86" => between(v"13", highest), + sm"87" => between(v"16", highest), + sm"89" => between(v"16", highest), + sm"90" => between(v"16", highest), + sm"90a" => between(v"18", highest), + sm"100" => between(v"20", highest), + sm"100a" => between(v"20", highest), + sm"100f" => between(v"21", highest), + sm"101" => between(v"20", highest), + sm"101a" => between(v"20", highest), + sm"101f" => between(v"21", highest), + sm"103" => between(v"21", highest), + sm"103a" => between(v"21", highest), + sm"103f" => between(v"21", highest), + sm"120" => between(v"20", highest), + sm"120a" => between(v"20", highest), + sm"120f" => between(v"21", highest), + sm"121" => between(v"21", highest), + sm"121a" => between(v"21", highest), + sm"121f" => between(v"21", highest), ) -function llvm_cap_support(ver::VersionNumber) - caps = Set{VersionNumber}() - for (cap,r) in llvm_cap_db +# Set of `SMVersion`s (across all feature sets) supported by LLVM `ver`. 
+function llvm_sm_support(ver::VersionNumber) + caps = Set{SMVersion}() + for (cap, r) in llvm_sm_db if ver in r push!(caps, cap) end @@ -295,32 +315,14 @@ end function llvm_compat(version=LLVM.version()) LLVM.InitializeNVPTXTarget() - cap_support = sort(collect(llvm_cap_support(version))) - ptx_support = sort(collect(llvm_ptx_support(version))) - - return (cap=cap_support, ptx=ptx_support) -end - -function cuda_compat(runtime=runtime_version(), compiler=compiler_version()) - # we don't have to check the driver version, because it offers backwards compatbility - # beyond the CUDA toolkit version (e.g. R580 for CUDA 13 still supports Volta as - # deprecated in CUDA 13), and we don't have a reliable way to query the actual version - # as NVML isn't available on all platforms. let's instead simply assume that unsupported - # devices will not be exposed to the CUDA runtime and thus won't be visible to us. - - # the compiler and runtime are versioned independently (and either can come from a - # local install), so we need to consider both: - # - device caps are dropped when either ptxas can't emit for them or the runtime - # libraries drop them. take the intersection of both supported sets. - # - PTX ISA availability is a property of ptxas; the runtime doesn't care which ISA - # compiled cubin came from. - cap_support = sort(collect(intersect(cuda_cap_support(runtime), - cuda_cap_support(compiler)))) - ptx_support = sort(collect(cuda_ptx_support(compiler))) - - return (cap=cap_support, ptx=ptx_support) + # `.sm` is `Set{SMVersion}` (with variants); `.ptx` is `Set{VersionNumber}`. + # `ptxas_compat()` returns `.cap` as `Set{VersionNumber}` because ptxas-level + # support is per-CC -- the names track the value type. 
+ return (sm=llvm_sm_support(version), + ptx=llvm_ptx_support(version)) end -function ptx_compat(ptx) - return (cap=ptx_cap_support(ptx),) +function ptxas_compat(version=compiler_version()) + return (cap=ptxas_cap_support(version), + ptx=ptxas_ptx_support(version)) end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 1b60e42602..bee91fbca1 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -1,12 +1,12 @@ ## gpucompiler interface implementation Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams - cap::VersionNumber + sm::SMVersion ptx::VersionNumber end function Base.hash(params::CUDACompilerParams, h::UInt) - h = hash(params.cap, h) + h = hash(params.sm, h) h = hash(params.ptx, h) return h @@ -119,10 +119,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap) +function rewrite_ptx_header(asm, ptx::VersionNumber, sm::SMVersion) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)") + r"\.target sm_\d+\w*" => ".target $(cpu_name(sm))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -145,9 +145,16 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap) = job.config.params - if job.config.target.ptx != ptx || job.config.target.cap != cap - asm = rewrite_ptx_header(asm, ptx, cap) + # The rewrite stamps `.target`/`.version` with the *requested* (cuda-side) values. + # When the GPUCompiler-side target matches, LLVM already emits the right header + # (including the `a`/`f` suffix, via the CPU name); we only rewrite when they differ, + # e.g. when we had to clamp the target down for LLVM compatibility. 
+ (; ptx, sm) = job.config.params + needs_rewrite = job.config.target.ptx != ptx || + job.config.target.cap != base_version(sm) || + job.config.target.feature_set !== sm.feature_set + if needs_rewrite + asm = rewrite_ptx_header(asm, ptx, sm) end return asm @@ -179,66 +186,86 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) - # determine the toolchain + arch=nothing, cap=nothing, ptx=nothing, kwargs...) + # `cap=` is the deprecated old name for `arch=` (matches nvcc/ptxas `-arch`). + if cap !== nothing + arch === nothing || + throw(ArgumentError("pass either `arch=` or the deprecated `cap=`, not both")) + Base.depwarn("the `cap=` kwarg is deprecated; use `arch=` (matching nvcc/ptxas `-arch`) instead.", + :cufunction) + arch = cap + end + # `SMVersion` is the universal normalizer: identity for an SMVersion, baseline-promotes + # a VersionNumber, parses a string. Anything else falls out as a MethodError naturally. + arch === nothing || (arch = SMVersion(arch)) + + # inspect the toolchain llvm_support = llvm_compat() - cuda_support = cuda_compat() + ptxas_support = ptxas_compat() - # determine the PTX ISA to use. we want at least 6.2, but will use newer if possible. - requested_ptx = something(ptx, v"6.2") - llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx) - cuda_ptxs = filter(>=(requested_ptx), cuda_support.ptx) + # determine the PTX ISA to use. 
if ptx !== nothing - # the user requested a specific PTX ISA - ## use the highest ISA supported by LLVM - isempty(llvm_ptxs) && + # explicit request: take it exactly, validating against the toolchain + ptx in llvm_support.ptx || error("Requested PTX ISA $ptx is not supported by LLVM $(LLVM.version())") - llvm_ptx = maximum(llvm_ptxs) - ## use the ISA as-is to invoke CUDA - cuda_ptx = ptx + ptx in ptxas_support.ptx || + error("Requested PTX ISA $ptx is not supported by ptxas $(compiler_version())") + llvm_ptx = ptxas_ptx = ptx else - # try to do the best thing (i.e., use the newest PTX ISA) - # XXX: is it safe to just use the latest PTX ISA? isn't it possible for, e.g., - # instructions to get deprecated? + # default: pick the newest PTX ISA supported by the toolchain (>=v6.2) + requested_ptx = v"6.2" + llvm_ptxs = filter(>=(requested_ptx), llvm_support.ptx) + ptxas_ptxs = filter(>=(requested_ptx), ptxas_support.ptx) isempty(llvm_ptxs) && error("CUDA.jl requires PTX $requested_ptx, which is not supported by LLVM $(LLVM.version())") - llvm_ptx = maximum(llvm_ptxs) - isempty(cuda_ptxs) && - error("CUDA.jl requires PTX $requested_ptx, which is not supported by CUDA $(compiler_version())") - cuda_ptx = maximum(cuda_ptxs) + isempty(ptxas_ptxs) && + error("CUDA.jl requires PTX $requested_ptx, which is not supported by ptxas $(compiler_version())") + ptxas_ptx = maximum(ptxas_ptxs) + llvm_ptx = min(maximum(llvm_ptxs), ptxas_ptx) end - # determine the compute capabilities to use. this should match the capability of the - # current device, but if LLVM doesn't support it, we can target an older capability - # and pass a different `-arch` to `ptxas`. 
- ptx_support = ptx_compat(cuda_ptx) - requested_cap = @something(cap, min(capability(dev), maximum(ptx_support.cap))) - llvm_caps = filter(<=(requested_cap), llvm_support.cap) - if cap !== nothing - ## use the highest capability supported by LLVM - isempty(llvm_caps) && - error("Requested compute capability $cap is not supported by LLVM $(LLVM.version())") - llvm_cap = maximum(llvm_caps) - ## use the capability as-is to invoke CUDA - cuda_cap = cap + # when selecting compute capabilities, we prefer the most recent one, as + # well as prefer to use architecture-accelerated features when available. + fs_rank(fs::Symbol) = fs === :arch ? 2 : fs === :family ? 1 : 0 + sm_key(sm::SMVersion) = (base_version(sm), fs_rank(sm.feature_set)) + + # determine the compute capability to use. + ## ptxas + ptx_sms = ptx_sm_support(ptxas_ptx) + if arch !== nothing + # explicit request: take it as-is, validating against the PTX ISA + arch in ptx_sms || + error("$(cpu_name(arch)) is not supported by PTX ISA $(ptxas_ptx)") + ptxas_sm = arch else - ## use the highest capability supported by LLVM - isempty(llvm_caps) && - error("Compute capability $(requested_cap) is not supported by LLVM $(LLVM.version())") - llvm_cap = maximum(llvm_caps) - ## use the highest capability supported by CUDA - cuda_caps = filter(<=(capability(dev)), cuda_support.cap) - isempty(cuda_caps) && - error("Compute capability $(requested_cap) is not supported by CUDA $(runtime_version())") - cuda_cap = maximum(cuda_caps) + # pick the most specific capability the selected PTX ISA supports whose cubin + # would actually load on the current device. For baseline that's the onion model; + # `:arch` requires an exact CC match, `:family` a same-family match. 
+ ptxas_candidates = filter(sm -> runs_on(sm, capability(dev)), ptx_sms) + isempty(ptxas_candidates) && + error("Compute capability $(capability(dev)) is not supported by ptxas " * + "$(compiler_version()) at PTX ISA $(ptxas_ptx)") + ptxas_sm = argmax(sm_key, ptxas_candidates) + end + ## LLVM + if ptxas_sm in llvm_support.sm + llvm_sm = ptxas_sm + else + # Exact `ptxas_sm` unavailable in LLVM. Fall back to baseline LLVM at the + # same or a lower base, since arch/family features don't carry across versions. + baseline_candidates = filter(llvm_support.sm) do sm + sm.feature_set === :baseline && base_version(sm) <= base_version(ptxas_sm) + end + isempty(baseline_candidates) && + error("Compute capability $(cpu_name(ptxas_sm)) is not supported by LLVM $(LLVM.version())") + llvm_sm = argmax(sm_key, baseline_candidates) end - - # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 - debuginfo = compiler_version() >= v"11.7" # create GPUCompiler objects - target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx) + target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, + feature_set=llvm_sm.feature_set, + debuginfo=true, kwargs...) + params = CUDACompilerParams(; sm=ptxas_sm, ptx=ptxas_ptx) CompilerConfig(target, params; kernel, name, always_inline) end @@ -273,9 +300,9 @@ function compile(@nospecialize(job::CompilerJob)) push!(ptxas_opts, "--compile-only") end - ptx = job.config.params.ptx - cap = job.config.params.cap - arch = "sm_$(cap.major)$(cap.minor)" + (; ptx, sm) = job.config.params + cap = base_version(sm) + arch = cpu_name(sm) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt @@ -288,7 +315,7 @@ function compile(@nospecialize(job::CompilerJob)) end if param_usage > param_limit msg = """Kernel invocation uses too much parameter memory. 
- $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by sm_$(cap.major)$(cap.minor) / PTX v$(ptx.major).$(ptx.minor).""" + $(Base.format_bytes(param_usage)) exceeds the $(Base.format_bytes(param_limit)) limit imposed by $(arch) / PTX v$(ptx.major).$(ptx.minor).""" try details = "\n\nRelevant parameters:" diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 130d049e7c..bfb7d963ce 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} = ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :arch, :cap, :ptx] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream] @@ -433,7 +433,12 @@ The following keyword arguments are supported: - `name`: override the name that the kernel will have in the generated code - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals -- `cap` and `ptx`: to override the compute capability and PTX version to compile for +- `arch` and `ptx`: override the GPU architecture (matching nvcc/ptxas `-arch`) and the + PTX ISA version to compile for. `arch` accepts either an [`SMVersion`](@ref) via the + `sm"..."` string macro (e.g. `arch=sm"103a"` for architecture-accelerated codegen on + CC 10.3, or `arch=sm"100f"` for family-portable Blackwell codegen) or a `VersionNumber` + (e.g. `arch=v"10.3"`, treated as baseline / forward-compatible). The old kwarg name + `cap=` is accepted as a deprecated alias. The output of this function is automatically cached, i.e. 
you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/CUDACore/src/compiler/sm.jl b/CUDACore/src/compiler/sm.jl new file mode 100644 index 0000000000..86f566766a --- /dev/null +++ b/CUDACore/src/compiler/sm.jl @@ -0,0 +1,144 @@ +export SMVersion, @sm_str + +""" + SMVersion(major, minor, [feature_set]) + SMVersion(s::AbstractString) + SMVersion(v::VersionNumber) + SMVersion(sm::SMVersion) + +A PTX compilation target, identifying a CUDA compute capability together with the +subtarget feature set selected by the suffix on its `.target` directive. Printed and +parsed in NVIDIA's compact form -- `sm"90"` for compute capability 9.0, `sm"103a"` +for 10.3 architecture-accelerated, etc. -- to mirror the `.target sm_NN[a|f]` +notation in the PTX ISA reference and to distinguish visually from a device-level +`VersionNumber` like `v"9.0"`. + +The single-argument constructors normalize various inputs to an `SMVersion`: + +- `SMVersion(::AbstractString)` parses the compact form, with or without the `sm_` + prefix (so e.g. `SMVersion("sm_103a")` and `SMVersion("103a")` both work). +- `SMVersion(::VersionNumber)` promotes a plain compute-capability version to a + baseline `SMVersion` (`SMVersion(v"10.3") == SMVersion(10, 3, :baseline)`). +- `SMVersion(::SMVersion)` is the identity (idempotent). + +This is what lets `@cuda arch=...` accept `v"10.3"`, `sm"103a"`, `"sm_103a"`, or +an already-constructed `SMVersion` interchangeably. + +`feature_set` is one of: + +- `:baseline` (no suffix, e.g. `sm_90`) — forward-compatible (the "onion model"): + PTX compiled for `sm_X` runs on any `sm_Y` with `Y >= X`. +- `:family` (`f` suffix, e.g. `sm_100f`) — same-major-family-portable: PTX runs on + any device in the same architecture family (currently == same major version) at + or above this CC. +- `:arch` (`a` suffix, e.g. 
`sm_90a`) — locked to one exact CC: PTX runs only on + devices with exactly this compute capability, but in exchange gets access to + architecture-accelerated features. + +See NVIDIA's PTX ISA reference under `.target` for the full compatibility rules, +and `lib/Target/NVPTX/NVPTX.td` in LLVM for the corresponding subtarget feature +definitions. + +Public fields: +- `sm.major::Int` +- `sm.minor::Int` +- `sm.feature_set::Symbol` + +See also [`@sm_str`](@ref) for an ergonomic string-macro constructor. + +# Examples +```julia +julia> SMVersion(9, 0) # baseline +sm"90" + +julia> SMVersion(9, 0, :arch) +sm"90a" + +julia> sm"100f" == SMVersion(10, 0, :family) +true +``` +""" +struct SMVersion + major::Int + minor::Int + feature_set::Symbol + + function SMVersion(major::Integer, minor::Integer, feature_set::Symbol = :baseline) + feature_set in (:baseline, :family, :arch) || + error("SMVersion feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))") + return new(Int(major), Int(minor), feature_set) + end +end + +function Base.parse(::Type{SMVersion}, s::AbstractString) + # Mirrors NVIDIA's `sm_NN[a|f]` notation: the last digit before the optional suffix + # is the minor, everything before it is the major. Always one minor digit (NVIDIA + # has never minted a CC with minor >= 10, and rolls the major over instead). The + # optional `sm_` prefix is accepted so PTX-tool output / config strings can pass + # straight through. + raw = startswith(s, "sm_") ? SubString(s, 4) : s + m = match(r"^(\d+)(\d)([af]?)$", raw) + m === nothing && error("invalid sm version string: $(repr(s)); expected e.g. \"103\", \"sm_103a\", or \"100f\"") + major = parse(Int, m.captures[1]) + minor = parse(Int, m.captures[2]) + fs = m.captures[3] == "a" ? :arch : + m.captures[3] == "f" ? :family : :baseline + return SMVersion(major, minor, fs) +end + +# Single-argument constructor: the universal normalizer for accepting an `arch`/`cap`-like +# argument. 
Identity for SMVersion; baseline-promotes a plain VersionNumber; parses a +# string (with or without the `sm_` prefix). +SMVersion(sm::SMVersion) = sm +SMVersion(v::VersionNumber) = SMVersion(v.major, v.minor, :baseline) +SMVersion(s::AbstractString) = Base.parse(SMVersion, s) + +# Suffix on the LLVM CPU name / `.target` directive +suffix(sm::SMVersion) = sm.feature_set === :arch ? "a" : + sm.feature_set === :family ? "f" : "" + +# LLVM CPU / PTX `.target` name (e.g. "sm_103a"). +cpu_name(sm::SMVersion) = "sm_$(sm.major)$(sm.minor)$(suffix(sm))" + +# Drop the feature set to recover the base compute-capability `VersionNumber`, +# usable against the version-keyed compatibility databases. +base_version(sm::SMVersion) = VersionNumber(sm.major, sm.minor) + +# Would a cubin compiled for `sm` actually load and run on a device with capability +# `dev_cap`? Per NVIDIA's PTX ISA reference (.target directive): +# - baseline: forward-compatible (onion model) -- any sm_X runs on sm_Y for Y >= X. +# - family: same architecture family (currently == same major) and forward-portable +# within the family. +# - arch: locked to one exact CC; cubin only loads on devices with that exact cap. +function runs_on(sm::SMVersion, dev_cap::VersionNumber) + if sm.feature_set === :arch + return base_version(sm) == dev_cap + elseif sm.feature_set === :family + return sm.major == dev_cap.major && base_version(sm) <= dev_cap + else # :baseline + return base_version(sm) <= dev_cap + end +end + + +Base.show(io::IO, sm::SMVersion) = print(io, "sm\"", sm.major, sm.minor, suffix(sm), "\"") + +""" + @sm_str + +String macro used to parse a string to an [`SMVersion`](@ref). Accepts NVIDIA's +compact `sm_NN[a|f]` notation (with or without the `sm_` prefix): `sm"90"` for +baseline, `sm"90a"` for architecture-accelerated, `sm"100f"` for family-specific. 
+Equivalent to calling `SMVersion(str)`; parses at macro-expansion time, so the +resulting `SMVersion` is a compile-time constant in the surrounding expression. + +# Examples +```julia +julia> sm"103a" +sm"103a" + +julia> sm"100f" == SMVersion(10, 0, :family) +true +``` +""" +macro sm_str(s); SMVersion(s); end diff --git a/CUDACore/src/device/intrinsics/version.jl b/CUDACore/src/device/intrinsics/version.jl index 6114833eeb..ea66448bda 100644 --- a/CUDACore/src/device/intrinsics/version.jl +++ b/CUDACore/src/device/intrinsics/version.jl @@ -1,8 +1,8 @@ # device intrinsics for querying the compute SimpleVersion and PTX ISA version -export compute_capability, ptx_isa_version +export compute_capability, ptx_isa_version, target_feature_set -for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"] +for var in ["sm_major", "sm_minor", "sm_features", "ptx_major", "ptx_minor"] @eval @device_function @inline $(Symbol(var))() = Base.llvmcall( $("""@$var = external global i32 @@ -17,3 +17,16 @@ end @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor()) @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor()) +# Feature set encoded in the `.target` directive: one of `:baseline`, `:family`, `:arch`. +# (NVIDIA's PTX ISA reference: ".target specifies the set of features in the target +# architecture for which the current PTX code was generated.") GPUCompiler stamps the +# encoding in via the `sm_features` LLVM global, using `GPUCompiler.TargetFeatureSet`; +# the integer load + chained compare folds away after LLVM inlines the constant, so +# user code like `if target_feature_set() === :arch ... end` resolves to a single +# branch in the PTX output. +@device_function @inline function target_feature_set() + f = sm_features() + return f == UInt32(GPUCompiler.ArchFeatures) ? :arch : + f == UInt32(GPUCompiler.FamilyFeatures) ? 
:family : :baseline +end + diff --git a/CUDACore/src/device/runtime.jl b/CUDACore/src/device/runtime.jl index cd3687f0ea..6739d27d68 100644 --- a/CUDACore/src/device/runtime.jl +++ b/CUDACore/src/device/runtime.jl @@ -12,14 +12,16 @@ function precompile_runtime() f = ()->return mi = methodinstance(typeof(f), Tuple{}) - caps = llvm_compat().cap + # `llvm_compat().sm` holds `SMVersion`s and includes variants; runtime caches are + # feature_set-agnostic, so we only warm the baseline entries. + sms = filter(sm -> sm.feature_set === :baseline, llvm_compat().sm) ptx = maximum(llvm_compat().ptx) JuliaContext() do ctx - for cap in caps, debuginfo in [false, true] + for sm in sms, debuginfo in [false, true] # NOTE: this often runs when we don't have a functioning set-up, # so we don't use `compiler_config` which requires NVML - target = PTXCompilerTarget(; cap, ptx, debuginfo) - params = CUDACompilerParams(; cap, ptx) + target = PTXCompilerTarget(; cap=base_version(sm), ptx, debuginfo) + params = CUDACompilerParams(; sm, ptx) config = CompilerConfig(target, params) job = CompilerJob(mi, config) GPUCompiler.load_runtime(job) diff --git a/CUDACore/src/precompile.jl b/CUDACore/src/precompile.jl index 58ae94f536..8817aa74cc 100644 --- a/CUDACore/src/precompile.jl +++ b/CUDACore/src/precompile.jl @@ -13,11 +13,16 @@ if :NVPTX in LLVM.backends() end llvm_support = llvm_compat() - llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap)) + # `.sm` is `Set{SMVersion}` (with variants); pick the highest baseline + # entry <= v"7.5" for a portable precompile artifact. 
+ llvm_sm = argmax(base_version, + filter(sm -> sm.feature_set === :baseline && + base_version(sm) <= v"7.5", + llvm_support.sm)) llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx)) - target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true) - params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx) + target = PTXCompilerTarget(; cap=base_version(llvm_sm), ptx=llvm_ptx, debuginfo=true) + params = CUDACompilerParams(; sm=llvm_sm, ptx=llvm_ptx) config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false) tt = Tuple{CuDeviceArray{Float32,1,AS.Global}} diff --git a/CUDATools/src/utilities.jl b/CUDATools/src/utilities.jl index 079ad71eb3..c14453cb7a 100644 --- a/CUDATools/src/utilities.jl +++ b/CUDATools/src/utilities.jl @@ -142,5 +142,27 @@ function versioninfo(io::IO=stdout) query_cuda() end println(io, " $(i-1): $str (sm_$(cap.major)$(cap.minor), $(Base.format_bytes(mem.free)) / $(Base.format_bytes(mem.total)) available)") + + # report the default compilation target we'd select for this device + config = try + CUDACore.compiler_config(dev) + catch + nothing + end + if config !== nothing + ptxas_sm = config.params.sm + ptxas_ptx = config.params.ptx + llvm_sm = CUDACore.SMVersion(config.target.cap.major, + config.target.cap.minor, + config.target.feature_set) + llvm_ptx = config.target.ptx + ptxas_str = "$(CUDACore.cpu_name(ptxas_sm)) / PTX $(ptxas_ptx.major).$(ptxas_ptx.minor)" + if llvm_sm == ptxas_sm && llvm_ptx == ptxas_ptx + println(io, " compiles to $ptxas_str") + else + llvm_str = "$(CUDACore.cpu_name(llvm_sm)) / PTX $(llvm_ptx.major).$(llvm_ptx.minor)" + println(io, " compiles to $ptxas_str (LLVM: $llvm_str)") + end + end end end diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md index a7ce178a3e..a2c493d789 100644 --- a/docs/src/api/compiler.md +++ b/docs/src/api/compiler.md @@ -25,6 +25,14 @@ registers memory ``` +The PTX compilation target is identified by an `SMVersion`, constructed via the 
+`sm"..."` string macro: + +```@docs +SMVersion +@sm_str +``` + To plug in alternative compiler back-ends (e.g. cuTile.jl), `@cuda` dispatches through a small protocol: diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 6049552626..e347cf8300 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -232,11 +232,33 @@ function main() - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + + # Run once to validate: the result must be finite and the sum of all entries must + # match a baked-in reference computed from this same StableRNG(123) + # seed. cuTile/perf/volumerhs.jl uses the same reference so the two + # implementations can be cross-checked. + CUDA.@sync kernel(rhs, Q, vgeo, DFloat(grav), D, nelem; + threads=threads, blocks=nelem) + rhs_h = Array(rhs) + @assert all(isfinite, rhs_h) "kernel produced non-finite values" + rsum = sum(rhs_h) + ref = 1.4227473f10 + rel = abs(rsum - ref) / abs(ref) + @assert rel < 1f-3 "rhs checksum off by $rel (got $rsum, expected $ref)" + @info "validation passed" rhs_sum=rsum reference=ref rel_err=rel + fill!(rhs, 0) + results = @benchmark begin + # zero rhs each iteration so accumulation stays meaningful + fill!($rhs, 0) CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; threads=$threads, blocks=$nelem) end + bytes = nelem * 28 * Nq^3 * sizeof(DFloat) + bw = bytes / (minimum(results).time / 1e9) / 1e9 + @info "SIMT volumerhs!
benchmark" min_ms=minimum(results).time/1e6 median_ms=median(results).time/1e6 effective_BW="$(round(Int, bw)) GB/s" + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them CUDA.unsafe_free!(rhs) CUDA.unsafe_free!(Q) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 02f275aa7d..db431db4ed 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -255,10 +255,51 @@ end @test !success(run_ptxas(asm_pre, "sm_75")) - asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0") + asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90") @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) + + # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same + # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", sm"90a") + @test occursin(".target sm_90a", asm_arch) + @test success(run_ptxas(asm_arch, "sm_90a")) + + # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line. 
+ asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", sm"100f") + @test occursin(".target sm_100f", asm_family) + @test success(run_ptxas(asm_family, "sm_100f")) +end + +@testset "SMVersion and sm\"...\" macro" begin + @test sm"90" == SMVersion(9, 0, :baseline) + @test sm"90a" == SMVersion(9, 0, :arch) + @test sm"100f" == SMVersion(10, 0, :family) + # printing roundtrips via the macro form + @test sprint(show, sm"103a") == "sm\"103a\"" + @test sprint(show, sm"100") == "sm\"100\"" + # cpu_name reflects feature_set + @test CUDACore.cpu_name(sm"90") == "sm_90" + @test CUDACore.cpu_name(sm"90a") == "sm_90a" + @test CUDACore.cpu_name(sm"100f") == "sm_100f" + # base_version drops the suffix back to a comparable VersionNumber + @test CUDACore.base_version(sm"103a") == v"10.3" + # constructor rejects bogus feature_set + @test_throws ErrorException SMVersion(9, 0, :bogus) + # parsing rejects malformed strings + @test_throws ErrorException parse(SMVersion, "10.3a") # dotted form (NVIDIA uses dotless) + @test_throws ErrorException parse(SMVersion, "100x") # unknown suffix + @test_throws ErrorException parse(SMVersion, "1") # only one digit (need at least major + minor) + @test_throws ErrorException parse(SMVersion, "") # empty + + # `SMVersion(x)` as the universal normalizer: + @test SMVersion(sm"103a") === sm"103a" # identity + @test SMVersion(v"10.3") == SMVersion(10, 3, :baseline) # VersionNumber → baseline + @test SMVersion("103a") == sm"103a" # bare string + @test SMVersion("sm_103a") == sm"103a" # accepts NVIDIA prefix + # the macro is just a parse-time call to the constructor + @test sm"103a" == SMVersion("103a") end end diff --git a/test/core/execution.jl b/test/core/execution.jl index edf589913d..387af3547e 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -50,17 +50,84 @@ end @cuda threads=2 dummy() # sm_10 isn't supported by LLVM - @test_throws "not supported by LLVM" @cuda launch=false cap=v"1.0" dummy() + @test_throws "not supported by LLVM"
@cuda launch=false arch=sm"10" dummy() # sm_20 is, but not by any CUDA version we support - @test_throws "Failed to compile PTX code" @cuda launch=false cap=v"2.0" dummy() + @test_throws "Failed to compile PTX code" @cuda launch=false arch=sm"20" dummy() # there isn't any capability other than the device's that's guaruanteed to work - @cuda launch=false cap=capability(device()) dummy() + dev_cap = capability(device()) + dev_sm = SMVersion(dev_cap.major, dev_cap.minor) + @cuda launch=false arch=dev_sm dummy() + # `arch=` also accepts a plain `VersionNumber` -- treated as baseline. Equivalent + # to constructing the SMVersion directly. + @cuda launch=false arch=dev_cap dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); cap=v"5.0")) + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50")) + @test contains(asm, ".target sm_50") + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0")) @test contains(asm, ".target sm_50") + # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the + # `.version` line should match what was asked for, independently of what LLVM and + # ptxas would natively pick. asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3")) @test contains(asm, ".version 6.3") + + # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it + # used to be); a clearly out-of-range value must error at config time. + @test_throws "not supported" @cuda launch=false ptx=v"99.0" dummy() + + # feature_set is selected by the suffix on the sm"..." string; the suffix should + # surface in the .target directive in the PTX output. The cuda-side `.target` is + # the variant regardless of LLVM support -- the mcgen rewrite stamps it in even + # when LLVM clamped to baseline for codegen. 
+ sm_a = SMVersion(dev_cap.major, dev_cap.minor, :arch) + sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) + + if dev_cap >= v"9.0" + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a)) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + # arch-specific cubin should also actually launch on the matching device + @cuda arch=sm_a dummy() + end + if dev_cap >= v"10.0" + asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f)) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") + @cuda arch=sm_f dummy() + end + + # `cap=` is the deprecated alias for `arch=`; check the depwarn fires while + # the path still produces the right PTX. + @test_deprecated sprint(io->CUDA.code_ptx(io, dummy, (); cap=sm"50")) + + # With no explicit `arch=`, we default to architecture-specific code paths on CC >=9.0 + # since we know the exact device. The cuda-side `.target` is the variant regardless of + # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs. + if dev_cap >= v"9.0" + asm = sprint(io->CUDA.code_ptx(io, dummy, ())) + @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + end + + # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built + # for (not the cuda-side .target): when LLVM doesn't natively support the exact variant, + # we fall back to baseline LLVM, so the global reflects baseline. The if-chain folds at + # codegen time, so the launched kernel writes a single constant. + function read_feature_set!(out) + @inbounds out[1] = if target_feature_set() === :arch + UInt32(2) + elseif target_feature_set() === :family + UInt32(1) + else + UInt32(0) + end + return + end + out = CuArray{UInt32}([typemax(UInt32)]) + @cuda threads=1 read_feature_set!(out) + # arch features come through `target_feature_set()` only when LLVM natively supported + # the variant; otherwise we fell back to baseline LLVM and the global reflects that. 
+ arch_in_llvm = sm_a in CUDACore.llvm_sm_support(CUDACore.LLVM.version()) + expected = dev_cap >= v"9.0" && arch_in_llvm ? UInt32(2) : UInt32(0) + @test Array(out)[1] == expected end