From a82a8447deebdf0056ab81bd5805876a373a8c27 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 13 Feb 2023 09:13:54 -0500 Subject: [PATCH 01/32] Implement KernelAbstractions backend in CUDA.jl Formerly know as CUDAKernels.jl --- Manifest.toml | 62 +++++++-- Project.toml | 3 + src/CUDA.jl | 4 + src/CUDAKernels.jl | 254 +++++++++++++++++++++++++++++++++++++ test/Project.toml | 3 + test/kernelabstractions.jl | 16 +++ 6 files changed, 334 insertions(+), 8 deletions(-) create mode 100644 src/CUDAKernels.jl create mode 100644 test/kernelabstractions.jl diff --git a/Manifest.toml b/Manifest.toml index 6225d8d8a7..2d634a920b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,9 +2,9 @@ [[AbstractFFTs]] deps = ["ChainRulesCore", "LinearAlgebra"] -git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409" +git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.2.1" +version = "1.3.1" [[Adapt]] deps = ["LinearAlgebra", "Requires"] @@ -18,6 +18,12 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + [[BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" @@ -64,9 +70,9 @@ version = "0.1.6" [[Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04" +git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.6.0" +version = "4.6.1" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -87,9 +93,9 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[ExprTools]] -git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" +git-tree-sha1 = 
"c1d06d129da9f55715c6c212866f5b1bddc5fa00" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.8" +version = "0.1.9" [[GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] @@ -105,9 +111,9 @@ version = "0.1.4" [[GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.17.2" +version = "0.17.3" [[InteractiveUtils]] deps = ["Markdown"] @@ -130,6 +136,12 @@ git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.4.1" +[[KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "17d0bb94eef881b09c57967be12cca70fefb3304" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.0" + [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04" @@ -178,6 +190,12 @@ version = "0.3.23" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.10" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -253,6 +271,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SnoopPrecompile]] +deps = ["Preferences"] +git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c" +uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c" +version = "1.0.3" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -266,6 +290,17 @@ 
git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "2.2.0" +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.5.16" + +[[StaticArraysCore]] +git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.0" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -295,6 +330,17 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +[[UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.0" + [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" diff --git a/Project.toml b/Project.toml index b18100dac1..19d8a411b3 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ CompilerSupportLibraries_jll = "e66e0078-7015-5450-92f7-15fbd957f2ae" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -28,6 +29,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] 
AbstractFFTs = "0.4, 0.5, 1.0" @@ -47,4 +49,5 @@ RandomNumbers = "1.5.3" Reexport = "0.2, 1.0" Requires = "0.5, 1.0" SpecialFunctions = "1.3, 2" +UnsafeAtomicsLLVM = "0.1" julia = "1.6" diff --git a/src/CUDA.jl b/src/CUDA.jl index 5ddccec286..0e918eae14 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -107,6 +107,10 @@ include("../lib/nvml/NVML.jl") const has_nvml = NVML.has_nvml export NVML, has_nvml +# KernelAbstractions +include("CUDAKernels.jl") +export CUDABackend + include("precompile.jl") end diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl new file mode 100644 index 0000000000..b43a1af8a5 --- /dev/null +++ b/src/CUDAKernels.jl @@ -0,0 +1,254 @@ +module CUDAKernels + +import KernelAbstractions +import CUDA +import UnsafeAtomicsLLVM +import GPUCompiler + +struct CUDABackend <: KernelAbstractions.GPU + prefer_blocks::Bool + always_inline::Bool +end +CUDABackend(;prefer_blocks=false, always_inline=false) = CUDABackend(prefer_blocks, always_inline) + +export CUDABackend + +KernelAbstractions.allocate(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.CuArray{T}(undef, dims) +KernelAbstractions.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims) +KernelAbstractions.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims) + +# Import through parent +import KernelAbstractions: StaticArrays, Adapt +import .StaticArrays: MArray + +KernelAbstractions.get_backend(::CUDA.CuArray) = CUDABackend() +KernelAbstractions.get_backend(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDABackend() + +KernelAbstractions.synchronize(::CUDABackend) = CUDA.synchronize() + +### +# copyto! +### +# - IdDict does not free the memory +# - WeakRef dict does not unique the key by objectid +const __pinned_memory = Dict{UInt64, WeakRef}() + +function __pin!(a) + # use pointer instead of objectid? 
+ oid = objectid(a) + if haskey(__pinned_memory, oid) && __pinned_memory[oid].value !== nothing + return nothing + end + ad = CUDA.Mem.register(CUDA.Mem.Host, pointer(a), sizeof(a)) + finalizer(_ -> CUDA.Mem.unregister(ad), a) + __pinned_memory[oid] = WeakRef(a) + return nothing +end + +function KernelAbstractions.copyto!(::CUDABackend, A, B) + A isa Array && __pin!(A) + B isa Array && __pin!(B) + + GC.@preserve A B begin + destptr = pointer(A) + srcptr = pointer(B) + N = length(A) + unsafe_copyto!(destptr, srcptr, N, async=true) + end + return A +end + +import KernelAbstractions: Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config + +### +# Kernel launch +### +function launch_config(kernel::Kernel{CUDABackend}, ndrange, workgroupsize) + if ndrange isa Integer + ndrange = (ndrange,) + end + if workgroupsize isa Integer + workgroupsize = (workgroupsize, ) + end + + # partition checked that the ndrange's agreed + if KernelAbstractions.ndrange(kernel) <: StaticSize + ndrange = nothing + end + + iterspace, dynamic = if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && + workgroupsize === nothing + # use ndrange as preliminary workgroupsize for autotuning + partition(kernel, ndrange, ndrange) + else + partition(kernel, ndrange, workgroupsize) + end + + return ndrange, workgroupsize, iterspace, dynamic +end + +function threads_to_workgroupsize(threads, ndrange) + total = 1 + return map(ndrange) do n + x = min(div(threads, total), n) + total *= x + return x + end +end + +function (obj::Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=nothing) + backend = KernelAbstractions.backend(obj) + + ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) + # this might not be the final context, since we may tune the workgroupsize + ctx = mkcontext(obj, ndrange, iterspace) + + # If the kernel is statically sized we can tell the compiler about that + if KernelAbstractions.workgroupsize(obj) <: StaticSize + 
maxthreads = prod(KernelAbstractions.get(KernelAbstractions.workgroupsize(obj))) + else + maxthreads = nothing + end + + kernel = CUDA.@cuda launch=false always_inline=backend.always_inline maxthreads=maxthreads obj.f(ctx, args...) + + # figure out the optimal workgroupsize automatically + if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing + config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange)) + if backend.prefer_blocks + # Prefer blocks over threads + threads = min(prod(ndrange), config.threads) + # XXX: Some kernels performs much better with all blocks active + cu_blocks = max(cld(prod(ndrange), threads), config.blocks) + threads = cld(prod(ndrange), cu_blocks) + else + threads = config.threads + end + + workgroupsize = threads_to_workgroupsize(threads, ndrange) + iterspace, dynamic = partition(obj, ndrange, workgroupsize) + ctx = mkcontext(obj, ndrange, iterspace) + end + + nblocks = length(blocks(iterspace)) + threads = length(workitems(iterspace)) + + if nblocks == 0 + return nothing + end + + # Launch kernel + kernel(ctx, args...; threads=threads, blocks=nblocks) + + return nothing +end + +# list of overrides (only for Julia 1.6) +const overrides = Expr[] + +macro device_override(ex) + ex = macroexpand(__module__, ex) + if Meta.isexpr(ex, :call) + # a bare call is not a method definition, so there is nothing to register as an override + error("@device_override expects a method definition, got a call expression: ", ex) + end + code = quote + $GPUCompiler.@override($CUDA.method_table, $ex) + end + if isdefined(Base.Experimental, Symbol("@overlay")) + return esc(code) + else + push!(overrides, code) + return + end +end + +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + precompiling && return + # register device overrides + eval(Expr(:block, overrides...)) + empty!(overrides) +end + +import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices +import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, 
__index_Global_Cartesian, __validindex, __print +import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds + +function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace) + CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace) +end + +@device_override @inline function __index_Local_Linear(ctx) + return CUDA.threadIdx().x +end + +@device_override @inline function __index_Group_Linear(ctx) + return CUDA.blockIdx().x +end + +@device_override @inline function __index_Global_Linear(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + # TODO: This is unfortunate, can we get the linear index cheaper + @inbounds LinearIndices(__ndrange(ctx))[I] +end + +@device_override @inline function __index_Local_Cartesian(ctx) + @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x] +end + +@device_override @inline function __index_Group_Cartesian(ctx) + @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x] +end + +@device_override @inline function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) +end + +@device_override @inline function __validindex(ctx) + if __dynamic_checkbounds(ctx) + I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x) + return I in __ndrange(ctx) + else + return true + end +end + +import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace +import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize, __size + +### +# GPU implementation of shared memory +### + +@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} + CUDA.CuStaticSharedArray(T, Dims) +end + +### +# GPU implementation of scratch memory +# - private memory for each workitem +### + +@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} + MArray{__size(Dims), T}(undef) +end + 
+@device_override @inline function __synchronize() + CUDA.sync_threads() +end + +@device_override @inline function __print(args...) + CUDA._cuprint(args...) +end + +### +# GPU implementation of const memory +### + +Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental.Const(a) + +# Argument conversion +KernelAbstractions.argconvert(k::Kernel{CUDABackend}, arg) = CUDA.cudaconvert(arg) + +end diff --git a/test/Project.toml b/test/Project.toml index 95252c2eda..ec5354dea7 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -7,7 +7,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -15,6 +17,7 @@ REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl new file mode 100644 index 0000000000..05f5d36978 --- /dev/null +++ b/test/kernelabstractions.jl @@ -0,0 +1,16 @@ +import KernelAbstractions +using Test + +include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl")) + +using CUDA +using CUDA.CUDAKernels + +if CUDA.functional() + CUDA.versioninfo() + CUDA.allowscalar(false) + Testsuite.testsuite(()->CUDABackend(false, false), 
"CUDA", CUDA, CuArray, CUDA.CuDeviceArray) + for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false)) + Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CUDA.CuDeviceArray) + end +end From f510c5070cd156b2bde961cee4cbf8871ba0c415 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 15:31:22 -0500 Subject: [PATCH 02/32] Use Atomix --- Project.toml | 3 + src/CUDAKernels.jl | 3 +- src/device/intrinsics.jl | 16 ++ src/device/intrinsics/atomics.jl | 264 +++++++++++++++-------- src/device/intrinsics/synchronization.jl | 139 +++++++++--- test/device/intrinsics/atomics.jl | 2 +- 6 files changed, 305 insertions(+), 122 deletions(-) diff --git a/Project.toml b/Project.toml index 19d8a411b3..4e35c695f5 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "4.0.1" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" @@ -29,11 +30,13 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] AbstractFFTs = "0.4, 0.5, 1.0" Adapt = "3.3" +Atomix = "0.1" BFloat16s = "0.2, 0.3, 0.4" CEnum = "0.2, 0.3, 0.4" CUDA_Driver_jll = "0.2" diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index b43a1af8a5..628f528f75 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -2,8 +2,6 @@ module CUDAKernels import KernelAbstractions import CUDA -import UnsafeAtomicsLLVM -import GPUCompiler struct CUDABackend <: KernelAbstractions.GPU prefer_blocks::Bool @@ 
-146,6 +144,7 @@ end # list of overrides (only for Julia 1.6) const overrides = Expr[] +import GPUCompiler macro device_override(ex) ex = macroexpand(__module__, ex) if Meta.isexpr(ex, :call) diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl index 443a7fd420..30e797cbd7 100644 --- a/src/device/intrinsics.jl +++ b/src/device/intrinsics.jl @@ -3,6 +3,22 @@ # special intrinsics for writing version-dependent code include("intrinsics/version.jl") +abstract type SyncScope end +struct SystemScope <: SyncScope end +struct DeviceScope <: SyncScope end +struct BlockScope <: SyncScope end + +const system_scope = SystemScope() +const device_scope = DeviceScope() +const block_scope = BlockScope() + +import UnsafeAtomics +using UnsafeAtomics.Internal: LLVMOrdering +using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst +# Note CUDA C++ has also consume ordering which LLVM does not support +# monotonic -> relaxed +# unordered -> ??? maybe weak + # extensions to the C language include("intrinsics/memory_shared.jl") include("intrinsics/indexing.jl") diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 15b8c10e39..2021529f14 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -1,5 +1,7 @@ # Atomic Functions (B.12) +# TODO replace the below with UnsafeAtomicsLLVM if possible + # # Low-level intrinsics # @@ -357,117 +359,193 @@ This operation is only supported for values of type Int32. """ atomic_dec! 
+asm(::Type{LLVMOrdering{:monotonic}}) = :relaxed +asm(::Type{LLVMOrdering{Order}}) where Order = Order + +asm(::Type{SystemScope}) = :sys +asm(::Type{DeviceScope}) = :gpu +asm(::Type{BlockScope}) = :cta + +for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope)) + asm_b64 = "ld.$(asm(order)).$(asm(scope)).b64 \$0, [\$1];" + asm_b32 = "ld.$(asm(order)).$(asm(scope)).b32 \$0, [\$1];" + @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) + @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) +end +@inline function __load(ptr::LLVMPtr{T}, order, scope) where T + if sizeof(T) == 4 + __load_32(ptr, order, scope) + elseif sizeof(T) == 8 + __load_64(ptr, order, scope) + else + error("atomic load requires a 4- or 8-byte element type") + end +end -# -# High-level interface -# - -# prototype of a high-level interface for performing atomic operations on arrays -# -# this design could be generalized by having atomic {field,array}{set,ref} accessors, as -# well as acquire/release operations to implement the fallback functionality where any -# operation can be applied atomically. +# Could be done using LLVM. 
+@inline __load_volatile_64(ptr::LLVMPtr{T, AS}) where {T, AS} = + @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) +@inline __load_volatile_32(ptr::LLVMPtr{T, AS}) where {T, AS} = + @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) + +@inline function __load_volatile(ptr::LLVMPtr{T}) where T + if sizeof(T) == 4 + __load_volatile_32(ptr) + elseif sizeof(T) == 8 + __load_volatile_64(ptr) + else + error("volatile load requires a 4- or 8-byte element type") + end +end -if VERSION <= v"1.7-" -export @atomic +@inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == release + error("atomic_load does not support release or acq_rel ordering") + end + if compute_capability() >= sv"7.0" + if order == monotonic + val = __load(ptr, monotonic, scope) + return val + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load(ptr, acquire, scope) + return val + else + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + val = __load_volatile(ptr) + if order == monotonic + return val + end + atomic_thread_fence(order, scope) + return val + end end -const inplace_ops = Dict( - :(+=) => :(+), - :(-=) => :(-), - :(*=) => :(*), - :(/=) => :(/), - :(\=) => :(\), - :(%=) => :(%), - :(^=) => :(^), - :(&=) => :(&), - :(|=) => :(|), - :(⊻=) => :(⊻), - :(>>>=) => :(>>>), - :(>>=) => :(>>), - :(<<=) => :(<<), -) +for (order, scope) in Iterators.product((LLVMOrdering{:release}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope)) + asm_b64 = "st.$(asm(order)).$(asm(scope)).b64 [\$0], \$1;" + asm_b32 = "st.$(asm(order)).$(asm(scope)).b32 [\$0], \$1;" + @eval @inline __store_64!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b64, "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) + @eval @inline __store_32!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b32, "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, 
T}, ptr, val) +end -struct AtomicError <: Exception - msg::AbstractString +@inline function __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T + if sizeof(T) == 4 + __store_32!(ptr, val, order, scope) + elseif sizeof(T) == 8 + __store_64!(ptr, val, order, scope) + else + error("atomic store requires a 4- or 8-byte element type") + end end -Base.showerror(io::IO, err::AtomicError) = - print(io, "AtomicError: ", err.msg) +# Could be done using LLVM. +@inline __store_volatile_32!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} = + @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) +@inline __store_volatile_64!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} = + @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) + +@inline function __store_volatile!(ptr::LLVMPtr{T}, val::T) where T + if sizeof(T) == 4 + __store_volatile_32!(ptr, val) + elseif sizeof(T) == 8 + __store_volatile_64!(ptr, val) + else + error("volatile store requires a 4- or 8-byte element type") + end +end -""" - @atomic a[I] = op(a[I], val) - @atomic a[I] ...= val - -Atomically perform a sequence of operations that loads an array element `a[I]`, performs the -operation `op` on that value and a second value `val`, and writes the result back to the -array. This sequence can be written out as a regular assignment, in which case the same -array element should be used in the left and right hand side of the assignment, or as an -in-place application of a known operator. In both cases, the array reference should be pure -and not induce any side-effects. - -!!! warn - This interface is experimental, and might change without warning. Use the lower-level - `atomic_...!` functions for a stable API, albeit one limited to natively-supported ops. 
-""" -macro atomic(ex) - # decode assignment and call - if ex.head == :(=) - ref = ex.args[1] - rhs = ex.args[2] - Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call")) - op = rhs.args[1] - if rhs.args[2] != ref - throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side")) +@inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T + if order == acq_rel || order == acquire # || order == consume + error("atomic_store! does not support acquire or acq_rel ordering") + end + if compute_capability() >= sv"7.0" + if order == release + __store!(ptr, val, release, scope) + return + end + if order == seq_cst + atomic_thread_fence(seq_cst, scope) end - val = rhs.args[3] - elseif haskey(inplace_ops, ex.head) - op = inplace_ops[ex.head] - ref = ex.args[1] - val = ex.args[2] + __store!(ptr, val, monotonic, scope) else - throw(AtomicError("unknown @atomic expression")) + if order == seq_cst || order == release # no st.release before sm_70: fence, then volatile store + atomic_thread_fence(order, scope) + end + __store_volatile!(ptr, val) end +end - # decode array expression - Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression")) - array = ref.args[1] - indices = Expr(:tuple, ref.args[2:end]...) 
+# +# High-level interface +# +import Atomix: @atomic, @atomicswap, @atomicreplace +# import UnsafeAtomicsLLVM + +if VERSION <= v"1.7-" + export @atomic +end - esc(quote - $atomic_arrayset($array, $indices, $op, $val) - end) +using Atomix: Atomix, IndexableRef + +const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable} + +@inline function Atomix.get(ref::CuIndexableRef, order) + atomic_load(Atomix.pointer(ref), order) end -# FIXME: make this respect the indexing style -@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} = - atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val)) - -# native atomics -for (op,impl,typ) in [(+, atomic_add!, [UInt32,Int32,UInt64,Int64,Float32]), - (-, atomic_sub!, [UInt32,Int32,UInt64,Int64,Float32]), - (&, atomic_and!, [UInt32,Int32,UInt64,Int64]), - (|, atomic_or!, [UInt32,Int32,UInt64,Int64]), - (⊻, atomic_xor!, [UInt32,Int32,UInt64,Int64]), - (max, atomic_max!, [UInt32,Int32,UInt64,Int64]), - (min, atomic_min!, [UInt32,Int32,UInt64,Int64])] - @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), - val::T) where {T<:Union{$(typ...)}} = - $impl(pointer(A, I), val) +@inline function Atomix.set!(ref::CuIndexableRef, v, order) + atomic_store!(Atomix.pointer(ref), convert(eltype(ref), v), order) end -# native atomics that are not supported on all devices -@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::typeof(+), - val::T) where {T <: Union{Float64}} - ptr = pointer(A, I) - if compute_capability() >= sv"6.0" - atomic_add!(ptr, val) - else - atomic_op!(ptr, op, val) +@inline function Atomix.replace!( + ref::CuIndexableRef, + expected, + desired, + success_ordering, + failure_ordering, +) + # TODO success_ordering and failure + ptr = Atomix.pointer(ref) + expected = convert(eltype(ref), expected) + desired = convert(eltype(ref), desired) + begin + old = atomic_cas!(ptr, expected, desired) end + return (; old = old, success = old === expected) end -# 
fallback using compare-and-swap -@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} = - atomic_op!(pointer(A, I), op, val) +@inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} + x = convert(eltype(ref), x) + ptr = Atomix.pointer(ref) + begin + old = if op === (+) + atomic_add!(ptr, x) + elseif op === (-) + atomic_sub!(ptr, x) + elseif op === (&) + atomic_and!(ptr, x) + elseif op === (|) + atomic_or!(ptr, x) + elseif op === xor + atomic_xor!(ptr, x) + elseif op === min + atomic_min!(ptr, x) + elseif op === max + atomic_max!(ptr, x) + else + error("not implemented") + end + end + return old => op(old, x) +end diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index 9c76737e36..1eb881db65 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -83,39 +83,126 @@ the warp. Cvoid, Tuple{UInt32}, convert(UInt32, mask)) end +@inline threadfence(::BlockScope) = threadfence_block() +@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) +@inline threadfence_sc_block() = @asmcall("fence.sc.cta;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_block() = @asmcall("fence.acq_rel.cta;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::BlockScope) + if compute_capability() >= sv"7.0" + if order == seq_cst + threadfence_sc_block() + elseif order == acquire || order == acq_rel || order == release # || order == consume + threadfence_acq_rel_block() + else + error("atomic_thread_fence: unsupported memory ordering") + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_block() + else + error("atomic_thread_fence: unsupported memory ordering") + end + end +end + +@inline threadfence(::DeviceScope=device_scope) = threadfence_device() +@inline threadfence_device() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) +@inline threadfence_sc_device() = 
@asmcall("fence.sc.gpu;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_device() = @asmcall("fence.acq_rel.gpu;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::DeviceScope=device_scope) + if compute_capability() >= sv"7.0" + if order == seq_cst + + threadfence_sc_device() + elseif order == acquire || + # order == consume || + order == acq_rel || + order == release + + threadfence_acq_rel_device() + else + assert(false) + end + else + if order == seq_cst() || + order == consume() || + order == acquire() || + order == acq_rel() || + order == release() + + threadfence_device() + else + assert(false) + end + end +end + +@inline threadfence(::SystemScope) = threadfence_system() +@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) +@inline threadfence_sc_system() = @asmcall("fence.sc.sys;", "~{memory}", true, Cvoid, Tuple{}) +@inline threadfence_acq_rel_system() = @asmcall("fence.acq_rel.sys;", "~{memory}", true, Cvoid, Tuple{}) + +function atomic_thread_fence(order, scope::SystemScope) + if compute_capability() >= sv"7.0" + if order == seq_cst + + threadfence_sc_system() + elseif order == acquire || + # order == consume || + order == acq_rel || + order == release + + threadfence_acq_rel_system() + else + assert(false) + end + else + if order == seq_cst || + # order == consume || + order == acquire || + order == acq_rel || + order == release + + threadfence_system() + else + assert(false) + end + end +end + """ - threadfence_block() + threadfence(::SyncScope=device_scope) A memory fence that ensures that: -- All writes to all memory made by the calling thread before the call to `threadfence_block()` - are observed by all threads in the block of the calling thread as occurring before all writes - to all memory made by the calling thread after the call to `threadfence_block()` -- All reads from all memory made by the calling thread before the call to `threadfence_block()` - are ordered 
before all reads from all memory made by the calling thread after the call to `threadfence_block()`. -""" -@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) +- All writes to all memory made by the calling thread before the call to `threadfence(scope)` + are observed by all threads in the scope of the calling thread as occurring before all writes + to all memory made by the calling thread after the call to `threadfence(scope)` +- All reads from all memory made by the calling thread before the call to `threadfence(scope)` + are ordered before all reads from all memory made by the calling thread after the call to `threadfence(scope)`. -""" - threadfence() +SyncScope can be one of `block_scope`, `device_scope`, or `system_scope`. + - `block_scope` orders reads and write on the *same* block. + - `device_scope` orders reads and write on the *same* device. + - `system_scope` orders reads and writes across all threads in the device, + host threads, and all threads in peer devices. -A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the -calling thread and also ensures that no writes to all memory made by the calling thread after -the call to `threadfence()` are observed by any thread in the device as occurring before any -write to all memory made by the calling thread before the call to `threadfence()`. +See [`atomic_thread_fence`](@ref) for a variant that takes atomic orderings. -Note that for this ordering guarantee to be true, the observing threads must truly observe the -memory and not cached versions of it; this is requires the use of volatile loads and stores, -which is not available from Julia right now. -""" -@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) +!!! note + Note that for this ordering guarantee to be true, the observing threads must truly observe the + memory and not cached versions of it; this is requires the use of atomic loads and stores. 
""" - threadfence_system() +function threadfence end -A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the -calling thread and also ensures that all writes to all memory made by the calling thread -before the call to `threadfence_system()` are observed by all threads in the device, -host threads, and all threads in peer devices as occurring before all writes to all -memory made by the calling thread after the call to `threadfence_system()`. """ -@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) + atomic_thread_fence(order::Atomicx.Ordering, ::SyncScope=device) +""" +function atomic_thread_fence end \ No newline at end of file diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 29810defe7..6507267b35 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -1,5 +1,5 @@ # TODO: unify with Base.@atomic -using CUDA: @atomic +using CUDA: @atomic, @atomicswap, @atomicreplace using BFloat16s: BFloat16 @testset "atomics (low-level)" begin From 871ea08ea7f24f883fedc1e2f4994f38e781ff1e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 15:51:06 -0500 Subject: [PATCH 03/32] Use cas --- src/device/intrinsics/atomics.jl | 122 +++++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 30 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 2021529f14..c4f9d47f40 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -485,6 +485,68 @@ end __store_volatile!(ptr, val) end end +, +for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope)) + asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" + asm_b32 = "atom.cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" + @eval @inline 
__cas_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) + @eval @inline __cas_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} = + @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) +end + +function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T + if sizeof(T) == 4 + __cas_32!(ptr, old, new, order, scope) + elseif sizeof(T) == 8 + __cas_64!(ptr, old, new, order, scope) + else + assert(false) + end +end + +for scope in (Block, Device, System) + asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" + asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" + @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T = + @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) + @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T = + @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) +end + +function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T + if sizeof(T) == 4 + __cas__volatile_32!(ptr, old, new, scope) + elseif sizeof(T) == 8 + __cas__volatile_64!(ptr, old, new, scope) + else + assert(false) + end +end + +function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::System=System()) where T + order = stronger_order(success_order, failure_order) + if compute_capability() >= sv"7.0" + if order == seq_cst + atomic_thread_fence(seq_cst, scope) + end + if order == seq_cst # order == consume + order = Acquire() + end + val = __cas!(ptr, old, new, order, scope) + else + if order == seq_cst || order == acq_rel || order == release + atomic_thread_fence(seq_cst, scope) + end + val = __cas_volatile!(ptr, old, new, scope) + if order == seq_cst || order == acq_rel || order == acquire # order == 
consume + atomic_thread_fence(seq_cst, scope) + end + end + success = val == old + return (; old, success) +end # # High-level interface @@ -508,44 +570,44 @@ end atomic_store!(Atomix.pointer(ref), v, order) end -@inline function Atomix.replace!( - ref::CuIndexableRef, - expected, - desired, - success_ordering, - failure_ordering, -) - # TODO success_ordering and failure +@inline function Atomix.replace!(ref::CuIndexableRef,expected,desired, + success_ordering,failure_ordering) ptr = Atomix.pointer(ref) expected = convert(eltype(ref), expected) desired = convert(eltype(ref), desired) - begin - old = atomic_cas!(ptr, expected, desired) + return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering) +end + +@inline modify!(ptr, op::OP, x, order) where {OP} + success = false + while !success + expected = atomic_load(ptr, order) + new = op(expected, new) + old, succss = atomic_cas!(ptr, old, new, order, relaxed) end - return (; old = old, success = old === expected) + return old => new end @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} x = convert(eltype(ref), x) ptr = Atomix.pointer(ref) - begin - old = if op === (+) - atomic_add!(ptr, x) - elseif op === (-) - atomic_sub!(ptr, x) - elseif op === (&) - atomic_and!(ptr, x) - elseif op === (|) - atomic_or!(ptr, x) - elseif op === xor - atomic_xor!(ptr, x) - elseif op === min - atomic_min!(ptr, x) - elseif op === max - atomic_max!(ptr, x) - else - error("not implemented") - end - end + # TODO: Support hardware variants + # old = if op === (+) + # atomic_add!(ptr, x) + # elseif op === (-) + # atomic_sub!(ptr, x) + # elseif op === (&) + # atomic_and!(ptr, x) + # elseif op === (|) + # atomic_or!(ptr, x) + # elseif op === xor + # atomic_xor!(ptr, x) + # elseif op === min + # atomic_min!(ptr, x) + # elseif op === max + # atomic_max!(ptr, x) + # else + modify!(ptr, op, x, ord) + # end return old => op(old, x) end From 67c2bcd57793fc0bab1ddff60e1696c3d2d9cf2a Mon Sep 17 
00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 16:12:53 -0500 Subject: [PATCH 04/32] fixup! Use cas --- src/device/intrinsics/atomics.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index c4f9d47f40..fbcb2106c2 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -485,7 +485,7 @@ end __store_volatile!(ptr, val) end end -, + for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), (BlockScope, DeviceScope, SystemScope)) asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" @@ -506,12 +506,12 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T end end -for scope in (Block, Device, System) +for scope in (BlockScope, DeviceScope, SystemScope) asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T = + @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) - @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T = + @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) end @@ -525,7 +525,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T end end -function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::System=System()) where T +function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) if compute_capability() >= 
sv"7.0" if order == seq_cst @@ -578,7 +578,7 @@ end return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering) end -@inline modify!(ptr, op::OP, x, order) where {OP} +@inline function modify!(ptr, op::OP, x, order) where {OP} success = false while !success expected = atomic_load(ptr, order) From c00d65b0862000144e9270e86302665b23ca82ee Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 17:13:43 -0500 Subject: [PATCH 05/32] Support load and store of Int8, Int16 --- src/device/intrinsics/atomics.jl | 42 +++++++++++++++---------------- test/device/intrinsics/atomics.jl | 17 ++----------- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index fbcb2106c2..1a26239a7b 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -386,17 +386,18 @@ end end end -# Could be done using LLVM. -@inline __load_volatile_64(ptr::LLVMPtr{T, AS}) where {T, AS} = - @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) -@inline __load_volatile_32(ptr::LLVMPtr{T, AS}) where {T, AS} = - @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) - -@inline function __load_volatile(ptr::LLVMPtr{T}) where T - if sizeof(T) == 4 - __load_volatile_32(ptr) +__supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4 + +# Could be done using LLVM +@inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS} + if sizeof(T) == 1 + @asmcall("ld.volatile.b8 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + elseif sizeof(T) == 2 + @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + elseif sizeof(T) == 4 + @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) elseif sizeof(T) == 8 - __load_volatile_64(ptr) + @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, 
Tuple{LLVMPtr{T, AS}}) else assert(false) end @@ -406,7 +407,7 @@ end if order == acq_rel || order == release assert(false) end - if compute_capability() >= sv"7.0" + if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == monotonic val = __load(ptr, monotonic, scope) return val @@ -450,16 +451,15 @@ end end # Could be done using LLVM. -@inline __store_volatile_32!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} = - @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) -@inline __store_volatile_64!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} = - @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - -@inline function __store_volatile!(ptr::LLVMPtr{T}, val::T) where T - if sizeof(T) == 4 - __store_volatile_32!(ptr, val) +@inline function __store_volatile!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} + if sizeof(T) == 1 + @asmcall("st.volatile.b8 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) + elseif sizeof(T) == 2 + @asmcall("st.volatile.b16 [\$0], \$1;", "l,h,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) + elseif sizeof(T) == 4 + @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) elseif sizeof(T) == 8 - __store_volatile_64!(ptr, val) + @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) else assert(false) end @@ -469,7 +469,7 @@ end if order == acq_rel || order == acquire # || order == consume assert(false) end - if compute_capability() >= sv"7.0" + if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == release __store!(ptr, val, release, scope) return diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 6507267b35..f31fdd8007 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -417,24 +417,11 @@ end @test 
isnan(Array(a)[1]) end - using CUDA: AtomicError - - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = 1 - end - @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin - @atomic a[1] = b ? 1 : 2 - end - - @test_throws_macro AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side") @macroexpand begin - @atomic a[1] = a[2] + 1 - end - - @test_throws_macro AtomicError("unknown @atomic expression") @macroexpand begin + @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin @atomic wat(a[1]) end - @test_throws_macro AtomicError("@atomic should be applied to an array reference expression") @macroexpand begin + @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin @atomic a = a + 1 end end From df395755487d9b30e74037dd2c94d81c76f83f82 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 17:32:43 -0500 Subject: [PATCH 06/32] be less stupid --- src/device/intrinsics/atomics.jl | 19 ++++++++++--------- src/device/intrinsics/synchronization.jl | 12 ++++++------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 1a26239a7b..0bbe1ce974 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -382,11 +382,11 @@ end elseif sizeof(T) == 8 __load_64(ptr, order, scope) else - assert(false) + @assert(false) end end -__supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4 +__supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8 # Could be done using LLVM @inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS} @@ -399,13 +399,13 @@ __supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4 elseif sizeof(T) == 8 
@asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) else - assert(false) + @assert(false) end end @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == release - assert(false) + @assert(false) end if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == monotonic @@ -446,7 +446,7 @@ end elseif sizeof(T) == 8 __store_64!(ptr, val, order, scope) else - assert(false) + @assert(false) end end @@ -461,13 +461,13 @@ end elseif sizeof(T) == 8 @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) else - assert(false) + @assert(false) end end @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == acquire # || order == consume - assert(false) + @assert(false) end if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == release @@ -502,7 +502,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T elseif sizeof(T) == 8 __cas_64!(ptr, old, new, order, scope) else - assert(false) + @assert(false) end end @@ -521,7 +521,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T elseif sizeof(T) == 8 __cas__volatile_64!(ptr, old, new, scope) else - assert(false) + @assert(false) end end @@ -567,6 +567,7 @@ const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable} end @inline function Atomix.set!(ref::CuIndexableRef, v, order) + v = convert(eltype(ref), v) atomic_store!(Atomix.pointer(ref), v, order) end diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index 1eb881db65..14fe2dbb49 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -95,7 +95,7 @@ function atomic_thread_fence(order, scope::BlockScope) elseif order == acquire || order == acq_rel || order == release # || 
order == consume threadfence_acq_rel_block() else - assert(false) + @assert(false) end else if order == seq_cst || @@ -106,7 +106,7 @@ function atomic_thread_fence(order, scope::BlockScope) threadfence_block() else - assert(false) + @assert(false) end end end @@ -128,7 +128,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) threadfence_acq_rel_device() else - assert(false) + @assert(false) end else if order == seq_cst() || @@ -139,7 +139,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) threadfence_device() else - assert(false) + @assert(false) end end end @@ -161,7 +161,7 @@ function atomic_thread_fence(order, scope::SystemScope) threadfence_acq_rel_system() else - assert(false) + @assert(false) end else if order == seq_cst || @@ -172,7 +172,7 @@ function atomic_thread_fence(order, scope::SystemScope) threadfence_system() else - assert(false) + @assert(false) end end end From 6d58044b98cc15034f79788f949ebe02e795c80d Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 17:47:05 -0500 Subject: [PATCH 07/32] fixup! be less stupid --- src/device/intrinsics/atomics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 0bbe1ce974..0daf94fa42 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -608,7 +608,7 @@ end # elseif op === max # atomic_max!(ptr, x) # else - modify!(ptr, op, x, ord) + modify!(ptr, op, x, order) # end return old => op(old, x) end From ce4482d3014528fa8afe6c02395fe42a1f2e0142 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 18:14:46 -0500 Subject: [PATCH 08/32] fix modify! 
implementation --- src/device/intrinsics/atomics.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 0daf94fa42..dd548dd1ac 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -525,7 +525,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T end end -function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T +function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) if compute_capability() >= sv"7.0" if order == seq_cst @@ -534,17 +534,17 @@ function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_ord if order == seq_cst # order == consume order = Acquire() end - val = __cas!(ptr, old, new, order, scope) + old = __cas!(ptr, expected, new, order, scope) else if order == seq_cst || order == acq_rel || order == release atomic_thread_fence(seq_cst, scope) end - val = __cas_volatile!(ptr, old, new, scope) + old = __cas_volatile!(ptr, expected, new, scope) if order == seq_cst || order == acq_rel || order == acquire # order == consume atomic_thread_fence(seq_cst, scope) end end - success = val == old + success = expected == old return (; old, success) end @@ -581,12 +581,12 @@ end @inline function modify!(ptr, op::OP, x, order) where {OP} success = false + expected = atomic_load(ptr, order) while !success - expected = atomic_load(ptr, order) - new = op(expected, new) - old, succss = atomic_cas!(ptr, old, new, order, relaxed) + new = op(expected, x) + expected, success = atomic_cas!(ptr, expected, new, order, relaxed) end - return old => new + return expected => new end @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} From b4fd958093ec0c58b9d13c37f51f3f61e7776c0e Mon Sep 
17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 18:25:54 -0500 Subject: [PATCH 09/32] fixup! fix modify! implementation --- src/device/intrinsics/atomics.jl | 33 ++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index dd548dd1ac..c811c31581 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -486,6 +486,34 @@ end end end +order(::LLVMOrdering{:monotonic}) = 1 +# order(::Consume) = 2 +order(::LLVMOrdering{:acquire}) = 3 +order(::LLVMOrdering{:release}) = 4 +order(::LLVMOrdering{:acq_rel}) = 5 +order(::LLVMOrdering{:seq_cst}) = 6 + +Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) + +function stronger_order(a::LLVMOrdering, b::LLVMOrdering) + m = max(a, b) + if m != release + return m + end + # maximum is release, what is the other one? + other = min(a, b) + if other == monotonic + return release + # elseif other == Consume() + # return Acq_Rel() + elseif other == acquire + return acq_rel + elseif other == release + return release + end + @assert(false) +end + for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), (BlockScope, DeviceScope, SystemScope)) asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" @@ -506,6 +534,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T end end +# TODO: Volatile cas for 16/8 for scope in (BlockScope, DeviceScope, SystemScope) asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" @@ -532,7 +561,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur atomic_thread_fence(seq_cst, scope) end if order == seq_cst # order == consume - order = Acquire() + order = acquire end old = __cas!(ptr, expected, new, order, scope) else @@ -584,7 
+613,7 @@ end expected = atomic_load(ptr, order) while !success new = op(expected, x) - expected, success = atomic_cas!(ptr, expected, new, order, relaxed) + expected, success = atomic_cas!(ptr, expected, new, order, monotonic) end return expected => new end From e22f8f6b2752a594b5dc080cb12a1826e4d37ed3 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 21:20:05 -0500 Subject: [PATCH 10/32] fix atomic usage in linalg.jl --- lib/cublas/linalg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cublas/linalg.jl b/lib/cublas/linalg.jl index ead34d4262..5779bc7c22 100644 --- a/lib/cublas/linalg.jl +++ b/lib/cublas/linalg.jl @@ -44,7 +44,7 @@ function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2} val = CUDA.reduce_block(+, local_val, zero(T), shuffle) if threadIdx().x == 1i32 # NOTE: introduces nondeterminism - @inbounds CUDA.@atomic res[] += val + @inbounds CUDA.@atomic res[1i32] += val end return From ecef6921c7bcf58b913d20bb464506797b13fb13 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 21:26:48 -0500 Subject: [PATCH 11/32] fixup! 
fix atomic usage in linalg.jl --- src/device/intrinsics/atomics.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index c811c31581..13304bfc25 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -371,9 +371,9 @@ for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:m asm_b64 = "ld.$(asm(order)).$(asm(scope)).b64 \$0, [\$1];" asm_b32 = "ld.$(asm(order)).$(asm(scope)).b32 \$0, [\$1];" @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T}}, ptr) + @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T}}, ptr) + @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) end @inline function __load(ptr::LLVMPtr{T}, order, scope) where T @@ -600,8 +600,8 @@ end atomic_store!(Atomix.pointer(ref), v, order) end -@inline function Atomix.replace!(ref::CuIndexableRef,expected,desired, - success_ordering,failure_ordering) +@inline function Atomix.replace!(ref::CuIndexableRef, expected, desired, + success_ordering, failure_ordering) ptr = Atomix.pointer(ref) expected = convert(eltype(ref), expected) desired = convert(eltype(ref), desired) From 7ae6ccaa2ff6fb5e6b50f67b2847a98c1fc09024 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 21:41:05 -0500 Subject: [PATCH 12/32] add error for SM_60 --- src/device/intrinsics/atomics.jl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 13304bfc25..ae4dd6181a 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -417,7 +417,7 @@ end end val = __load(ptr, acquire, 
scope) return val - else + elseif compute_capability() >= sv"6.0" if order == seq_cst atomic_thread_fence(seq_cst, scope) end @@ -427,6 +427,8 @@ end end atomic_thread_fence(order, scope) return val + else + error("Atomics are only supported on SM_60") end end @@ -478,11 +480,13 @@ end atomic_thread_fence(seq_cst, scope) end __store!(ptr, val, monotonic, scope) - else + elseif compute_capability() >= sv"6.0" if order == seq_cst atomic_thread_fence(seq_cst, scope) end __store_volatile!(ptr, val) + else + error("Atomics are only supported on SM_60") end end @@ -556,7 +560,7 @@ end function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) - if compute_capability() >= sv"7.0" + if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == seq_cst atomic_thread_fence(seq_cst, scope) end @@ -564,7 +568,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur order = acquire end old = __cas!(ptr, expected, new, order, scope) - else + elseif compute_capability() >= sv"6.0" if order == seq_cst || order == acq_rel || order == release atomic_thread_fence(seq_cst, scope) end @@ -572,6 +576,8 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur if order == seq_cst || order == acq_rel || order == acquire # order == consume atomic_thread_fence(seq_cst, scope) end + else + error("Atomics are only supported on SM_60") end success = expected == old return (; old, success) From 80c6ab6049790f473b2d082a19c25586e0482a6b Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 21:46:39 -0500 Subject: [PATCH 13/32] Fixup modify --- src/device/intrinsics/atomics.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index ae4dd6181a..c123e324d1 100644 --- a/src/device/intrinsics/atomics.jl +++ 
b/src/device/intrinsics/atomics.jl @@ -550,9 +550,9 @@ end function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T if sizeof(T) == 4 - __cas__volatile_32!(ptr, old, new, scope) + __cas_volatile_32!(ptr, old, new, scope) elseif sizeof(T) == 8 - __cas__volatile_64!(ptr, old, new, scope) + __cas_volatile_64!(ptr, old, new, scope) else @assert(false) end @@ -614,9 +614,10 @@ end return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering) end -@inline function modify!(ptr, op::OP, x, order) where {OP} +@inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP} success = false expected = atomic_load(ptr, order) + local new::T while !success new = op(expected, x) expected, success = atomic_cas!(ptr, expected, new, order, monotonic) @@ -643,7 +644,7 @@ end # elseif op === max # atomic_max!(ptr, x) # else - modify!(ptr, op, x, order) + return modify!(ptr, op, x, order) # end - return old => op(old, x) + # return old => op(old, x) end From aa6258688eb78496af97cdb7445f45bc33e23879 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 10 Mar 2023 22:11:57 -0500 Subject: [PATCH 14/32] skip shmem for now --- test/device/intrinsics/atomics.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index f31fdd8007..dae88ccdd7 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -193,15 +193,15 @@ end end end -@testset "shared memory" begin - function kernel() - shared = CuStaticSharedArray(Float32, 1) - @atomic shared[threadIdx().x] += 0f0 - return - end - - CUDA.@sync @cuda kernel() -end +# @testset "shared memory" begin +# function kernel() +# shared = CuStaticSharedArray(Float32, 1) +# @atomic shared[threadIdx().x] += 0f0 +# return +# end + +# CUDA.@sync @cuda kernel() +# end end From 6325ee9ee33f97e33a4800a6377fa3ca8a133a69 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 11 
Mar 2023 09:55:55 -0500 Subject: [PATCH 15/32] fix volatile load --- src/device/intrinsics.jl | 6 +++++ src/device/intrinsics/atomics.jl | 30 ++++++++++++++---------- src/device/intrinsics/synchronization.jl | 12 +++++----- test/device/intrinsics/atomics.jl | 25 ++++++++++++++++++++ 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl index 30e797cbd7..1bf3887b62 100644 --- a/src/device/intrinsics.jl +++ b/src/device/intrinsics.jl @@ -15,6 +15,12 @@ const block_scope = BlockScope() import UnsafeAtomics using UnsafeAtomics.Internal: LLVMOrdering using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst + +struct AtomicUnsupported{T} <: Exception end +struct AtomicOrderUnsupported{Ordering} <: Exception + order::Ordering +end + # Note CUDA C++ has also consume ordering which LLVM does not support # monotonic -> relaxed # unordered -> ??? maybe weak diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index c123e324d1..ff3ea6a49d 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -382,30 +382,33 @@ end elseif sizeof(T) == 8 __load_64(ptr, order, scope) else - @assert(false) + throw(AtomicUnsupported{T}()) end end __supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8 -# Could be done using LLVM +# Could be done using LLVM +# TODO: Register choice for Float32/Float64 @inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS} if sizeof(T) == 1 - @asmcall("ld.volatile.b8 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + val = @asmcall("ld.volatile.b8 \$0, [\$1];", "=r,l,~{memory}", true, UInt32, Tuple{LLVMPtr{T, AS}}, ptr) + return val % T elseif sizeof(T) == 2 - @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + val = @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, UInt16, Tuple{LLVMPtr{T, AS}}, ptr) + return Core.bitcast(T, 
val) # Float16 otherwise complaints elseif sizeof(T) == 4 - @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) elseif sizeof(T) == 8 - @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}) + @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) else - @assert(false) + throw(AtomicUnsupported{T}()) end end @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == release - @assert(false) + throw(AtomicOrderUnsupported(order)) end if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == monotonic @@ -448,7 +451,7 @@ end elseif sizeof(T) == 8 __store_64!(ptr, val, order, scope) else - @assert(false) + throw(AtomicUnsupported{T}()) end end @@ -463,13 +466,13 @@ end elseif sizeof(T) == 8 @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) else - @assert(false) + throw(AtomicUnsupported{T}()) end end @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == acquire # || order == consume - @assert(false) + throw(AtomicOrderUnsupported(order)) end if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == release @@ -515,6 +518,7 @@ function stronger_order(a::LLVMOrdering, b::LLVMOrdering) elseif other == release return release end + Base.llvmcall("unreachable", Cvoid, Tuple{}) @assert(false) end @@ -534,7 +538,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T elseif sizeof(T) == 8 __cas_64!(ptr, old, new, order, scope) else - @assert(false) + throw(AtomicUnsupported{T}()) end end @@ -554,7 +558,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T elseif sizeof(T) == 8 __cas_volatile_64!(ptr, 
old, new, scope) else - @assert(false) + throw(AtomicUnsupported{T}()) end end diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index 14fe2dbb49..e98baa16a2 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -95,7 +95,7 @@ function atomic_thread_fence(order, scope::BlockScope) elseif order == acquire || order == acq_rel || order == release # || order == consume threadfence_acq_rel_block() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end else if order == seq_cst || @@ -106,7 +106,7 @@ function atomic_thread_fence(order, scope::BlockScope) threadfence_block() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end end end @@ -128,7 +128,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) threadfence_acq_rel_device() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end else if order == seq_cst() || @@ -139,7 +139,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) threadfence_device() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end end end @@ -161,7 +161,7 @@ function atomic_thread_fence(order, scope::SystemScope) threadfence_acq_rel_system() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end else if order == seq_cst || @@ -172,7 +172,7 @@ function atomic_thread_fence(order, scope::SystemScope) threadfence_system() else - @assert(false) + throw(AtomicOrderUnsupported(order)) end end end diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index dae88ccdd7..178489fd31 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -2,6 +2,31 @@ using CUDA: @atomic, @atomicswap, @atomicreplace using BFloat16s: BFloat16 +@testset "atomics (low-level) with order" begin + +@testset "atomic_load" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + 
Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] + unsupported_orders = [CUDA.release, CUDA.acq_rel] + + function kernel(a, order, scope) + CUDA.atomic_load(pointer(a), order, scope) + return + end + + for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, order, scope) + end + end +end +end # atomics (low-level) with order + @testset "atomics (low-level)" begin # tested on all natively-supported atomics From c7c6f3341e670279857709a545a23a44ba48d7fc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 11 Mar 2023 18:28:18 -0500 Subject: [PATCH 16/32] add more low-level tests --- src/device/intrinsics/atomics.jl | 1 - test/device/intrinsics/atomics.jl | 45 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index ff3ea6a49d..74a97eda9f 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -542,7 +542,6 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T end end -# TODO: Volatile cas for 16/8 for scope in (BlockScope, DeviceScope, SystemScope) asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 178489fd31..655eb81de0 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -25,6 +25,51 @@ using BFloat16s: BFloat16 end end end + +@testset "atomic_store!" 
begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] + unsupported_orders = [CUDA.acquire, CUDA.acq_rel] + + function kernel(a, val, order, scope) + CUDA.atomic_store!(pointer(a), val, order, scope) + return + end + + for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, one(T), order, scope) + end + end +end + +@testset "atomic_cas!" begin + if capability(device()) >= v"6.0" + # TODO size(T) in (1, 2) + types = [Int32, Int64, + UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] + + function kernel(a, expected, desired, success_order, failure_order, scope) + CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope) + return + end + + for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope) + end + end +end + end # atomics (low-level) with order @testset "atomics (low-level)" begin From 6192f44e75576c8be4ea1dc2822450a95fe8ab12 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 11 Mar 2023 19:55:00 -0500 Subject: [PATCH 17/32] cleanup and use egal --- src/device/intrinsics/atomics.jl | 14 +- test/device/intrinsics/atomics.jl | 573 +++++---------------- test/device/intrinsics/lowlevel_atomics.jl | 308 +++++++++++ 3 files changed, 432 insertions(+), 463 deletions(-) create mode 100644 test/device/intrinsics/lowlevel_atomics.jl diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 74a97eda9f..d2e05c1f8c 100644 --- 
a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -502,7 +502,7 @@ order(::LLVMOrdering{:seq_cst}) = 6 Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) -function stronger_order(a::LLVMOrdering, b::LLVMOrdering) +@inline function stronger_order(a::LLVMOrdering, b::LLVMOrdering) m = max(a, b) if m != release return m @@ -532,7 +532,7 @@ for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:a @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) end -function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T +@inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T if sizeof(T) == 4 __cas_32!(ptr, old, new, order, scope) elseif sizeof(T) == 8 @@ -545,13 +545,13 @@ end for scope in (BlockScope, DeviceScope, SystemScope) asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = + @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) - @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = + @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) end -function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T +@inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T if sizeof(T) == 4 __cas_volatile_32!(ptr, old, new, scope) elseif sizeof(T) == 8 @@ -561,7 +561,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T end end -function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, 
scope::SyncScope=device_scope) where T +@inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) if compute_capability() >= sv"7.0" && __supports_atomic(T) if order == seq_cst @@ -582,7 +582,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur else error("Atomics are only supported on SM_60") end - success = expected == old + success = expected === old # egal since otherwise NaN's won't work. return (; old, success) end diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl index 655eb81de0..da331c0474 100644 --- a/test/device/intrinsics/atomics.jl +++ b/test/device/intrinsics/atomics.jl @@ -1,480 +1,182 @@ -# TODO: unify with Base.@atomic using CUDA: @atomic, @atomicswap, @atomicreplace -using BFloat16s: BFloat16 - -@testset "atomics (low-level) with order" begin - -@testset "atomic_load" begin - if capability(device()) >= v"6.0" - types = [Int8, Int16, Int32, Int64, - UInt8, UInt16, UInt32, UInt64, - Float64, Float32] - scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered - supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] - unsupported_orders = [CUDA.release, CUDA.acq_rel] - - function kernel(a, order, scope) - CUDA.atomic_load(pointer(a), order, scope) - return - end - - for (T, order, scope) in Iterators.product(types, supported_orders, scopes) - a = CuArray(T[0]) - @cuda threads=1 kernel(a, order, scope) - end - end -end - -@testset "atomic_store!" 
begin - if capability(device()) >= v"6.0" - types = [Int8, Int16, Int32, Int64, - UInt8, UInt16, UInt32, UInt64, - Float64, Float32] - scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered - supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] - unsupported_orders = [CUDA.acquire, CUDA.acq_rel] - - function kernel(a, val, order, scope) - CUDA.atomic_store!(pointer(a), val, order, scope) - return - end - - for (T, order, scope) in Iterators.product(types, supported_orders, scopes) - a = CuArray(T[0]) - @cuda threads=1 kernel(a, one(T), order, scope) - end - end -end - -@testset "atomic_cas!" begin - if capability(device()) >= v"6.0" - # TODO size(T) in (1, 2) - types = [Int32, Int64, - UInt32, UInt64, - Float64, Float32] - scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered - orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] - - function kernel(a, expected, desired, success_order, failure_order, scope) - CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope) - return - end - - for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes) - a = CuArray(T[0]) - @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope) - end - end -end - -end # atomics (low-level) with order - -@testset "atomics (low-level)" begin - -# tested on all natively-supported atomics - -@testset "atomic_add" begin - types = [Int32, Int64, UInt32, UInt64, Float32] - capability(device()) >= v"6.0" && push!(types, Float64) - capability(device()) >= v"7.0" && push!(types, Float16) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_add!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_sub" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[2048]) - - 
function kernel(a, b) - CUDA.atomic_sub!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_inc" begin - @testset for T in [Int32] - a = CuArray(T[0]) - - function kernel(a, b) - CUDA.atomic_inc!(pointer(a), b) - return - end - - @cuda threads=768 kernel(a, T(512)) - @test Array(a)[1] == 255 - end -end - -@testset "atomic_dec" begin - @testset for T in [Int32] - a = CuArray(T[1024]) - - function kernel(a, b) - CUDA.atomic_dec!(pointer(a), b) - return - end - - @cuda threads=256 kernel(a, T(512)) - @test Array(a)[1] == 257 - end -end - -@testset "atomic_xchg" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray([zero(T)]) - - function kernel(a, b) - CUDA.atomic_xchg!(pointer(a), b) - return - end - - @cuda threads=1024 kernel(a, one(T)) - @test Array(a)[1] == one(T) - end -end - -@testset "atomic_and" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - 1 - k = 1 - for i = 1:i - k *= 2 - end - b = 1023 - k # 1023 - 2^i - CUDA.atomic_and!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == zero(T) - end -end - -@testset "atomic_or" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[0]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_or!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 1023 - end -end - -@testset "atomic_xor" begin - @testset for T in [Int32, Int64, UInt32, UInt64] - a = CuArray(T[1023]) - - function kernel(a, T) - i = threadIdx().x - b = 1 # 2^(i-1) - for i = 1:i - b *= 2 - end - b /= 2 - CUDA.atomic_xor!(pointer(a), T(b)) - return - end - - @cuda threads=10 kernel(a, T) - @test Array(a)[1] == 0 - end -end - -@testset "atomic_cas" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && 
append!(types, [UInt16, BFloat16]) - - @testset for T in types - a = CuArray(T[0]) - - function kernel(a, b, c) - CUDA.atomic_cas!(pointer(a), b, c) - return - end - - @cuda threads=1024 kernel(a, zero(T), one(T)) - @test Array(a)[1] == 1 - end -end - -@testset "atomic_max" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray([zero(T)]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_max!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1024 - end -end - -@testset "atomic_min" begin - types = [Int32, Int64, UInt32, UInt64] - - @testset for T in types - a = CuArray(T[1024]) - - function kernel(a, T) - i = threadIdx().x - CUDA.atomic_min!(pointer(a), T(i)) - return - end - - @cuda threads=1024 kernel(a, T) - @test Array(a)[1] == 1 - end -end - -# @testset "shared memory" begin -# function kernel() -# shared = CuStaticSharedArray(Float32, 1) -# @atomic shared[threadIdx().x] += 0f0 -# return -# end - -# CUDA.@sync @cuda kernel() -# end - -end @testset "atomics (high-level)" begin - -# tested on all types supported by atomic_cas! (which empowers the fallback definition) - -@testset "add" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + # tested on all types supported by atomic_cas! 
(which empowers the fallback definition) + + @testset "add" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] + 1 @atomic a[1] += 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 2048 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "sub" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[2048]) - + + @testset "sub" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] - 1 @atomic a[1] -= 1 return end - @cuda threads=1024 kernel(T, a) - @test Array(a)[1] == 0 + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(T, a) + @test Array(a)[1] == 1024 + end end -end - -@testset "mul" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1]) - + + @testset "mul" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] * 2 @atomic a[1] *= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1024 + @testset for T in types + a = CuArray(T[1]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "div" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray(T[1024]) - + + @testset "div" begin + types = [Int32, 
Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) - @atomic a[1] = a[1] / 2 @atomic a[1] /= 2 return end - @cuda threads=5 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray(T[32]) + @cuda threads=5 kernel(T, a) + @test Array(a)[1] == 1 + end end -end - -@testset "and" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([~zero(T), ~zero(T)]) - + + @testset "and" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = ~(T(1) << (i-1)) - @atomic a[1] = a[1] & mask - @atomic a[2] &= mask + @atomic a[1] &= mask return end - - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == zero(T) - @test Array(a)[2] == zero(T) + + @testset for T in types + a = CuArray([~zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == zero(T) + end end -end - -@testset "or" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([zero(T), zero(T)]) + + @testset "or" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) function kernel(T, a) i = threadIdx().x mask = T(1) << (i-1) - @atomic a[1] = a[1] | mask - @atomic a[2] |= mask + @atomic a[1] |= mask return end - @cuda threads=8*sizeof(T) kernel(T, a) - @test Array(a)[1] == ~zero(T) - @test Array(a)[2] == ~zero(T) + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=8*sizeof(T) kernel(T, a) + @test Array(a)[1] == ~zero(T) + end end -end - -@testset "xor" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = 
CuArray([zero(T), zero(T)]) - + + @testset "xor" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) i = threadIdx().x mask = T(1) << ((i-1)%(8*sizeof(T))) - @atomic a[1] = a[1] ⊻ mask - @atomic a[2] ⊻= mask + @atomic a[1] ⊻= mask return end - nb = 4 - @cuda threads=(8*sizeof(T)+nb) kernel(T, a) - @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) - @test Array(a)[2] == ~zero(T) & ~((one(T) << nb) - one(T)) + @testset for T in types + a = CuArray([zero(T)]) + nb = 4 + @cuda threads=(8*sizeof(T)+nb) kernel(T, a) + @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T)) + end end -end - -@testset "max" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([zero(T)]) - + + @testset "max" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = max(a[1], i) + @atomic a[1] max i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 32 + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=32 kernel(T, a) + @test Array(a)[1] == 32 + end end -end - -@testset "min" begin - types = [Int32, Int64, UInt32, UInt64, Float32, Float64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) - - @testset for T in types - a = CuArray([typemax(T)]) - + + @testset "min" begin + types = [Int32, Int64, UInt32, UInt64, Float32, Float64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16]) + function kernel(T, a) i = threadIdx().x - @atomic a[1] = min(a[1], i) + @atomic a[1] min i return end - @cuda threads=32 kernel(T, a) - @test Array(a)[1] == 1 + @testset for T in types + a = CuArray([typemax(T)]) + @cuda threads=32 kernel(T, a) + @test 
Array(a)[1] == 1 + end end -end - -@testset "shift" begin - types = [Int32, Int64, UInt32, UInt64] - capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) - - @testset for T in types - a = CuArray([one(T)]) - + + @testset "shift" begin + types = [Int32, Int64, UInt32, UInt64] + # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16]) + function kernel(T, a) @atomic a[1] <<= 1 return end - @cuda threads=8 kernel(T, a) - @test Array(a)[1] == 1<<8 + @testset for T in types + a = CuArray([one(T)]) + @cuda threads=8 kernel(T, a) + @test Array(a)[1] == 1<<8 + end end -end - -@testset "macro" begin - + @testset "NaN" begin f(x,y) = 3x + 2y function kernel(x) - CUDA.@atomic x[1] = f(x[1],42f0) + @inbounds CUDA.@atomic x[1] f 42f0 nothing end @@ -487,56 +189,15 @@ end @test isnan(Array(a)[1]) end - @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin - @atomic wat(a[1]) - end - - @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin - @atomic a = a + 1 - end -end - -@testset "shared memory" begin - # test that atomic operations on shared memory work - # https://github.com/JuliaGPU/CUDA.jl/issues/311 - - function kernel(a) - b = CUDA.CuStaticSharedArray(Int, 1) - - if threadIdx().x == 1 - b[] = a[] + @testset "macro" begin + @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin + @atomic wat(a[1]) end - sync_threads() - - CUDA.atomic_add!(pointer(b), 1) - sync_threads() - - if threadIdx().x == 1 - a[] = b[] + + @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin + @atomic a = a + 1 end - return end - - a = CuArray([0]) - @cuda threads=16 kernel(a) - @test Array(a) == [16] -end - -@testset "shared memory bug" begin - # shared memory atomics resulted in illegal memory accesses - # https://github.com/JuliaGPU/CUDA.jl/issues/558 - - function kernel() - tid = 
threadIdx().x - shared = CuStaticSharedArray(Float32, 4) - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - return - end - - @cuda threads=2 kernel() - synchronize() -end - + end + \ No newline at end of file diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl new file mode 100644 index 0000000000..7122cd984f --- /dev/null +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -0,0 +1,308 @@ +using BFloat16s: BFloat16 + +@testset "atomics (low-level) with order" begin + +@testset "atomic_load" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] + unsupported_orders = [CUDA.release, CUDA.acq_rel] + + function kernel(a, order, scope) + CUDA.atomic_load(pointer(a), order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, order, scope) + @test Array(a)[1] == 0 + end + end +end + +@testset "atomic_store!" begin + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] + unsupported_orders = [CUDA.acquire, CUDA.acq_rel] + + function kernel(a, val, order, scope) + CUDA.atomic_store!(pointer(a), val, order, scope) + return + end + + @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, one(T), order, scope) + @test Array(a)[1] == one(T) + end + end +end + +@testset "atomic_cas!" 
begin + if capability(device()) >= v"6.0" + # TODO size(T) in (1, 2) + types = [Int32, Int64, + UInt32, UInt64, + Float64, Float32] + scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] + # TODO unordered + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] + + function kernel(a, expected, desired, success_order, failure_order, scope) + CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope) + return + end + + @testset for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes) + a = CuArray(T[0]) + @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope) + @test Array(a)[1] == one(T) + end + end +end + +end # atomics (low-level) with order + +@testset "atomics (low-level)" begin + +# tested on all natively-supported atomics + +@testset "atomic_add" begin + types = [Int32, Int64, UInt32, UInt64, Float32] + capability(device()) >= v"6.0" && push!(types, Float64) + capability(device()) >= v"7.0" && push!(types, Float16) + + function kernel(a, b) + CUDA.atomic_add!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[0]) + + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_sub" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, b) + CUDA.atomic_sub!(pointer(a), b) + return + end + + @testset for T in types + a = CuArray(T[2048]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_inc" begin + function kernel(a, b) + CUDA.atomic_inc!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[0]) + @cuda threads=768 kernel(a, T(512)) + @test Array(a)[1] == 255 + end +end + +@testset "atomic_dec" begin + function kernel(a, b) + CUDA.atomic_dec!(pointer(a), b) + return + end + + @testset for T in [Int32] + a = CuArray(T[1024]) + @cuda threads=256 kernel(a, T(512)) + @test Array(a)[1] == 257 + 
end +end + +@testset "atomic_xchg" begin + function kernel(a, b) + CUDA.atomic_xchg!(pointer(a), b) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, one(T)) + @test Array(a)[1] == one(T) + end +end + +@testset "atomic_and" begin + function kernel(a, T) + i = threadIdx().x - 1 + k = 1 + for i = 1:i + k *= 2 + end + b = 1023 - k # 1023 - 2^i + CUDA.atomic_and!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == zero(T) + end +end + +@testset "atomic_or" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_or!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[0]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 1023 + end +end + +@testset "atomic_xor" begin + function kernel(a, T) + i = threadIdx().x + b = 1 # 2^(i-1) + for i = 1:i + b *= 2 + end + b /= 2 + CUDA.atomic_xor!(pointer(a), T(b)) + return + end + @testset for T in [Int32, Int64, UInt32, UInt64] + a = CuArray(T[1023]) + @cuda threads=10 kernel(a, T) + @test Array(a)[1] == 0 + end +end + +@testset "atomic_cas" begin + types = [Int32, Int64, UInt32, UInt64] + capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16]) + + function kernel(a, b, c) + CUDA.atomic_cas!(pointer(a), b, c) + return + end + + @testset for T in types + a = CuArray(T[0]) + @cuda threads=1024 kernel(a, zero(T), one(T)) + @test Array(a)[1] == 1 + end +end + +@testset "atomic_max" begin + types = [Int32, Int64, UInt32, UInt64] + + function kernel(a, T) + i = threadIdx().x + CUDA.atomic_max!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray([zero(T)]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1024 + end +end + +@testset "atomic_min" begin + types = [Int32, Int64, UInt32, UInt64] + + function 
kernel(a, T) + i = threadIdx().x + CUDA.atomic_min!(pointer(a), T(i)) + return + end + + @testset for T in types + a = CuArray(T[1024]) + @cuda threads=1024 kernel(a, T) + @test Array(a)[1] == 1 + end +end + +@testset "shared memory" begin + @testset "simple" begin + function kernel() + shared = CuStaticSharedArray(Float32, 1) + CUDA.atomic_add!(pointer(shared, threadIdx().x), 0f0) + return + end + + CUDA.@sync @cuda kernel() + end + + @testset "shared memory reduction" begin + # test that atomic operations on shared memory work + # https://github.com/JuliaGPU/CUDA.jl/issues/311 + + function kernel(a) + b = CUDA.CuStaticSharedArray(Int, 1) + + if threadIdx().x == 1 + b[] = a[] + end + sync_threads() + + CUDA.atomic_add!(pointer(b), 1) + sync_threads() + + if threadIdx().x == 1 + a[] = b[] + end + return + end + + a = CuArray([0]) + @cuda threads=16 kernel(a) + @test Array(a) == [16] + end + + @testset "shared memory bug" begin + # shared memory atomics resulted in illegal memory accesses + # https://github.com/JuliaGPU/CUDA.jl/issues/558 + + function kernel() + tid = threadIdx().x + shared = CuStaticSharedArray(Float32, 4) + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + sync_threads() + CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) + return + end + + @cuda threads=2 kernel() + synchronize() + end +end + +end # low-level atomics From 7437924ecf478aa03277547a8bc7b3c0736a7df8 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 11 Mar 2023 21:42:22 -0500 Subject: [PATCH 18/32] add fallback for < sm_60 --- src/device/intrinsics/atomics.jl | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index d2e05c1f8c..423c7af317 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -431,7 +431,16 @@ end atomic_thread_fence(order, scope) return val else - error("Atomics are only supported on SM_60") + # 
Fallback to threadfence w/o order + load_volatile + if order == seq_cst + threadfence(scope) + end + val = __load_volatile(ptr) + if order == monotonic + return val + end + threadfence(scope) + return val end end @@ -489,7 +498,11 @@ end end __store_volatile!(ptr, val) else - error("Atomics are only supported on SM_60") + # Fallback to threadfence w/o order + store_volatile + if order == seq_cst + threadfence(scope) + end + __store_volatile!(ptr, val) end end @@ -580,7 +593,15 @@ end atomic_thread_fence(seq_cst, scope) end else - error("Atomics are only supported on SM_60") + # Fallback to atomic_cas w/o scope on pre SM_60 + if order == seq_cst || order == acq_rel || order == release + threadfence(scope) + end + val = atomic_cas!(ptr, expected, new) + if order == seq_cst || order == acq_rel || order == acquire # order == consume + threadfence(scope) + end + return val end success = expected === old # egal since otherwise NaN's won't work. return (; old, success) From 5d63f5f1d883f5898cb685f16c9a04bcf1445961 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 11 Mar 2023 22:05:24 -0500 Subject: [PATCH 19/32] add __cas_volatile_16 and global/shared --- src/device/intrinsics/atomics.jl | 54 ++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 423c7af317..55da7ae0c5 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -535,14 +535,22 @@ Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) @assert(false) end -for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), - (BlockScope, DeviceScope, SystemScope)) - asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" - asm_b32 = "atom.cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - @eval @inline __cas_64!(ptr::LLVMPtr{T, AS}, old::T, 
new::T, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) - @eval @inline __cas_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) +for (order, scope, A) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + if A == AS.Global + as = ".global" + elseif A == AS.Shared + as = ".shared" + else + as = "" + end + asm_b64 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" + asm_b32 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" + @eval @inline __cas_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = + @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + @eval @inline __cas_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = + @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end @inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T @@ -555,17 +563,31 @@ end end end -for scope in (BlockScope, DeviceScope, SystemScope) - asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" - asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = - @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) - @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} = - @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new) +for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + if A 
== AS.Global + as = ".global" + elseif A == AS.Shared + as = ".shared" + else + as = "" + end + + asm_b64 = "atom$(as).cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" + asm_b32 = "atom$(as).cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" + asm_b16 = "atom$(as).cas.$(asm(scope)).b16 \$0,[\$1],\$2,\$3;" + @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($asm_b32, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end @inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T - if sizeof(T) == 4 + if sizeof(T) == 2 + __cas_volatile_16!(ptr, old, new, scope) + elseif sizeof(T) == 4 __cas_volatile_32!(ptr, old, new, scope) elseif sizeof(T) == 8 __cas_volatile_64!(ptr, old, new, scope) From 450808bb4f16edaa8e847f6ee3263dd605a7fb3d Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 12 Mar 2023 01:29:33 -0500 Subject: [PATCH 20/32] no I am not losing my mind --- src/device/intrinsics/atomics.jl | 166 +++++++++++++++++++------------ 1 file changed, 101 insertions(+), 65 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 55da7ae0c5..57627efef4 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -366,51 +366,94 @@ asm(::Type{SystemScope}) = :sys asm(::Type{DeviceScope}) = :gpu asm(::Type{BlockScope}) = :cta -for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), - (BlockScope, DeviceScope, SystemScope)) - asm_b64 = "ld.$(asm(order)).$(asm(scope)).b64 \$0, [\$1];" - asm_b32 = 
"ld.$(asm(order)).$(asm(scope)).b32 \$0, [\$1];" - @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) - @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) +function suffix(sz) + if sz == 1 + "b8" + elseif sz == 2 + "b16" + elseif sz == 4 + "b32" + elseif sz == 8 + "b64" + end end -@inline function __load(ptr::LLVMPtr{T}, order, scope) where T - if sizeof(T) == 4 - __load_32(ptr, order, scope) - elseif sizeof(T) == 8 - __load_64(ptr, order, scope) +function reg(sz) + if sz == 1 + "r" + elseif sz == 2 + "h" + elseif sz == 4 + "r" + elseif sz == 8 + "l" + end +end + +function addr_space(A) + if A == AS.Global + as = ".global" + elseif A == AS.Shared + as = ".shared" else - throw(AtomicUnsupported{T}()) + as = "" end end -__supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8 +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end -# Could be done using LLVM -# TODO: Register choice for Float32/Float64 -@inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS} - if sizeof(T) == 1 - val = @asmcall("ld.volatile.b8 \$0, [\$1];", "=r,l,~{memory}", true, UInt32, Tuple{LLVMPtr{T, AS}}, ptr) - return val % T - elseif sizeof(T) == 2 - val = @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, UInt16, Tuple{LLVMPtr{T, AS}}, ptr) - return Core.bitcast(T, val) # Float16 otherwise complaints - elseif 
sizeof(T) == 4 - @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) - elseif sizeof(T) == 8 - @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr) - else - throw(AtomicUnsupported{T}()) +# Handle byte sized load +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval function @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) + end +end + +@inline __load(ptr::LLVMPtr{T}, order, scope) where T = + __load(Val(sizeof(T)), ptr, order, scope) + +for (A, sz) in Iterators.product( + (AS.Generic, AS.Global, AS.Shared), + (2,4,8)) + instruction = "ld$(addr_space(A)).volatile.$(suffix(sz)) \$0, [\$1];" + constraint = "=$(reg(sz)),l,~{memory}" + @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) +end + +# Handle byte sized load +for (A) in (AS.Generic, AS.Global, AS.Shared) + instruction = "ld$(addr_space(A)).volatile.b8 \$0, [\$1];" + constraint = "=r,l,~{memory}" + @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) + return Core.bitcast(T, val % UInt8) end end +@inline __load_volatile(ptr::LLVMPtr{T}) where {T} = + __load_volatile(Val(sizeof(T)), ptr) + @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == release throw(AtomicOrderUnsupported(order)) end - if compute_capability() >= sv"7.0" && __supports_atomic(T) + if 
compute_capability() >= sv"7.0" if order == monotonic val = __load(ptr, monotonic, scope) return val @@ -444,46 +487,40 @@ end end end -for (order, scope) in Iterators.product((LLVMOrdering{:release}, LLVMOrdering{:monotonic}), - (BlockScope, DeviceScope, SystemScope)) - asm_b64 = "st.$(asm(order)).$(asm(scope)).b64 [\$0], \$1;" - asm_b32 = "st.$(asm(order)).$(asm(scope)).b32 [\$0], \$1;" - @eval @inline __store_64!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b64, "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - @eval @inline __store_32!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} = - @asmcall($asm_b32, "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:release}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) end -@inline function __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T - if sizeof(T) == 4 - __store_32!(ptr, val, order, scope) - elseif sizeof(T) == 8 - __store_64!(ptr, val, order, scope) - else - throw(AtomicUnsupported{T}()) - end +@inline __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T = + __store!(Val(sizeof(T)), ptr, val, order, scope) + +for (A, sz) in Iterators.product( + (LLVMOrdering{:release}, LLVMOrdering{:monotonic}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (1, 2, 4, 8)) + instruction = "st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;" + constraint = "l,$(reg(sz)),~{memory}" + @eval @inline __store_volatile!(::Val{$sz}, ptr::LLVMPtr{T, $A}, 
val::T) where {T} = + @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val) end # Could be done using LLVM. -@inline function __store_volatile!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} - if sizeof(T) == 1 - @asmcall("st.volatile.b8 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - elseif sizeof(T) == 2 - @asmcall("st.volatile.b16 [\$0], \$1;", "l,h,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - elseif sizeof(T) == 4 - @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - elseif sizeof(T) == 8 - @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val) - else - throw(AtomicUnsupported{T}()) - end -end +@inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} = + __store_volatile(Val(sizeof(T)), ptr, val) @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == acquire # || order == consume throw(AtomicOrderUnsupported(order)) end - if compute_capability() >= sv"7.0" && __supports_atomic(T) + if compute_capability() >= sv"7.0" if order == release __store!(ptr, val, release, scope) return @@ -581,7 +618,7 @@ for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope), @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = - @asmcall($asm_b32, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + @asmcall($asm_b16, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end @inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T @@ -598,7 +635,7 @@ end @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, 
new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) - if compute_capability() >= sv"7.0" && __supports_atomic(T) + if compute_capability() >= sv"7.0" && 2 <= sizeof(T) <= 4 if order == seq_cst atomic_thread_fence(seq_cst, scope) end @@ -619,11 +656,10 @@ end if order == seq_cst || order == acq_rel || order == release threadfence(scope) end - val = atomic_cas!(ptr, expected, new) + old = atomic_cas!(ptr, expected, new) if order == seq_cst || order == acq_rel || order == acquire # order == consume threadfence(scope) end - return val end success = expected === old # egal since otherwise NaN's won't work. return (; old, success) From 353e0367b7211f276b252d1b8bcb6b8cdb9a3b89 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 12 Mar 2023 10:29:05 -0400 Subject: [PATCH 21/32] fixup! no I am not losing my mind --- src/device/intrinsics/atomics.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 57627efef4..b6703d87a7 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -418,7 +418,7 @@ for (order, scope, A) in Iterators.product( (AS.Generic, AS.Global, AS.Shared)) instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" constraint = "=r,l,~{memory}" - @eval function @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} + @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) return Core.bitcast(T, val % UInt8) end @@ -502,8 +502,6 @@ end __store!(Val(sizeof(T)), ptr, val, order, scope) for (A, sz) in Iterators.product( - (LLVMOrdering{:release}, LLVMOrdering{:monotonic}), - (BlockScope, DeviceScope, SystemScope), (AS.Generic, AS.Global, AS.Shared), (1, 2, 4, 8)) instruction = 
"st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;" From 334fad2d8b781067b58929ca2f04d34f071be15a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 12 Mar 2023 10:45:56 -0400 Subject: [PATCH 22/32] cleanup cas --- src/device/intrinsics/atomics.jl | 94 ++++++++++---------------------- 1 file changed, 30 insertions(+), 64 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index b6703d87a7..4d6e5ab65e 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -570,66 +570,32 @@ Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b)) @assert(false) end -for (order, scope, A) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), - (BlockScope, DeviceScope, SystemScope), - (AS.Generic, AS.Global, AS.Shared)) - if A == AS.Global - as = ".global" - elseif A == AS.Shared - as = ".shared" - else - as = "" - end - asm_b64 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" - asm_b32 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - @eval @inline __cas_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = - @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) - @eval @inline __cas_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = - @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +for (order, scope, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared), + (4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, 
new::T, ::$order, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end -@inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T - if sizeof(T) == 4 - __cas_32!(ptr, old, new, order, scope) - elseif sizeof(T) == 8 - __cas_64!(ptr, old, new, order, scope) - else - throw(AtomicUnsupported{T}()) - end -end +@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = + __cas(sizeof(T), ptr, old, new, order, scope) -for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope), - (AS.Generic, AS.Global, AS.Shared)) - if A == AS.Global - as = ".global" - elseif A == AS.Shared - as = ".shared" - else - as = "" - end - - asm_b64 = "atom$(as).cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;" - asm_b32 = "atom$(as).cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;" - asm_b16 = "atom$(as).cas.$(asm(scope)).b16 \$0,[\$1],\$2,\$3;" - @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = - @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) - @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = - @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) - @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = - @asmcall($asm_b16, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) +for (scope, A, sz) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared), + (2, 4, 8)) + instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" + @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @asmcall($instruction, $constraint, true, T, 
Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end -@inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T - if sizeof(T) == 2 - __cas_volatile_16!(ptr, old, new, scope) - elseif sizeof(T) == 4 - __cas_volatile_32!(ptr, old, new, scope) - elseif sizeof(T) == 8 - __cas_volatile_64!(ptr, old, new, scope) - else - throw(AtomicUnsupported{T}()) - end -end +@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = + __cas!(Val(sizeof(T)), ptr, old, new, scope) @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) @@ -645,7 +611,7 @@ end if order == seq_cst || order == acq_rel || order == release atomic_thread_fence(seq_cst, scope) end - old = __cas_volatile!(ptr, expected, new, scope) + old = __cas!(ptr, expected, new, scope) if order == seq_cst || order == acq_rel || order == acquire # order == consume atomic_thread_fence(seq_cst, scope) end @@ -659,8 +625,7 @@ end threadfence(scope) end end - success = expected === old # egal since otherwise NaN's won't work. 
- return (; old, success) + return old end # @@ -695,14 +660,15 @@ end end @inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP} - success = false - expected = atomic_load(ptr, order) - local new::T - while !success + old = atomic_load(ptr, order) + while true + expected = old new = op(expected, x) - expected, success = atomic_cas!(ptr, expected, new, order, monotonic) + old = atomic_cas!(ptr, expected, new, order, monotonic) + if old === expected + return expected => new + end end - return expected => new end @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP} From 68bf406574094e6b70da73cc2acf1b97aecf3521 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 12 Mar 2023 15:16:29 -0400 Subject: [PATCH 23/32] try to test sm_60, sm_35 --- src/compiler/execution.jl | 2 +- src/compiler/gpucompiler.jl | 2 +- src/compiler/reflection.jl | 2 +- src/device/intrinsics/atomics.jl | 8 ++--- test/device/intrinsics/lowlevel_atomics.jl | 34 ++++++++++++++++------ 5 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 3ac23cb4dc..c16277d1fd 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -40,7 +40,7 @@ macro cuda(ex...) 
macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = split_kwargs(kwargs, [:dynamic, :launch], - [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline], + [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline, :cap], [:cooperative, :blocks, :threads, :shmem, :stream]) if !isempty(other_kwargs) key,val = first(other_kwargs).args diff --git a/src/compiler/gpucompiler.jl b/src/compiler/gpucompiler.jl index bd39307b3a..a6364ebbbc 100644 --- a/src/compiler/gpucompiler.jl +++ b/src/compiler/gpucompiler.jl @@ -15,7 +15,7 @@ function device_properties(dev) cap = maximum(caps) # select the PTX ISA we assume to be available - # (we actually only need 6.2, but NVPTX doesn't support that) + # 6.3 introduced `atom.cas.b16` ptx = v"6.3" # we need to take care emitting LLVM instructions like `unreachable`, which diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl index 19e9f66d3d..4a1cb4a36d 100644 --- a/src/compiler/reflection.jl +++ b/src/compiler/reflection.jl @@ -125,7 +125,7 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) function $method(io::IO, @nospecialize(func), @nospecialize(types); kernel::Bool=false, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, always_inline::Bool=false, - kwargs...) + cap=capability(device()), kwargs...) 
source = FunctionSpec(func, Base.to_tuple_type(types), kernel) target = CUDACompilerTarget(device(); minthreads, maxthreads, blocks_per_sm, maxregs) params = CUDACompilerParams() diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 4d6e5ab65e..e8fe0f0cc6 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -405,7 +405,7 @@ for (order, scope, A, sz) in Iterators.product( (BlockScope, DeviceScope, SystemScope), (AS.Generic, AS.Global, AS.Shared), (2,4,8)) - instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1];" + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" constraint = "=$(reg(sz)),l,~{memory}" @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} = @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) @@ -416,7 +416,7 @@ for (order, scope, A) in Iterators.product( (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}), (BlockScope, DeviceScope, SystemScope), (AS.Generic, AS.Global, AS.Shared)) - instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" + instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).b8 \$0, [\$1];" constraint = "=r,l,~{memory}" @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) @@ -430,7 +430,7 @@ end for (A, sz) in Iterators.product( (AS.Generic, AS.Global, AS.Shared), (2,4,8)) - instruction = "ld$(addr_space(A)).volatile.$(suffix(sz)) \$0, [\$1];" + instruction = "ld.volatile$(addr_space(A)).$(suffix(sz)) \$0, [\$1];" constraint = "=$(reg(sz)),l,~{memory}" @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} = @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr) @@ -438,7 +438,7 @@ end # Handle byte sized load for (A) in (AS.Generic, AS.Global, AS.Shared) - 
instruction = "ld$(addr_space(A)).volatile.b8 \$0, [\$1];" + instruction = "ld.volatile$(addr_space(A)).b8 \$0, [\$1];" constraint = "=r,l,~{memory}" @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T} val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr) diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl index 7122cd984f..0bedb80a3e 100644 --- a/test/device/intrinsics/lowlevel_atomics.jl +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -1,25 +1,41 @@ using BFloat16s: BFloat16 +function atomic_types(cap) + types = [ + Int32, Int64, + UInt32, UInt64, + Float64, Float32] + if cap >= v"6.0" + append!(types, [ + Int8, Int16, + UInt8, UInt16, + Float16]) + end + return types +end + @testset "atomics (low-level) with order" begin @testset "atomic_load" begin - if capability(device()) >= v"6.0" - types = [Int8, Int16, Int32, Int64, - UInt8, UInt16, UInt32, UInt64, - Float64, Float32] + capabilities = (v"3.5", v"6.0", v"7.0") + current_cap = capability(device()) + + capabilities = filter(c->c<=current_cap, capabilities) + + @testset for cap in capabilities + types = atomic_types(cap) scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered - supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] - unsupported_orders = [CUDA.release, CUDA.acq_rel] + orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] + # unsupported_orders = [CUDA.release, CUDA.acq_rel] function kernel(a, order, scope) CUDA.atomic_load(pointer(a), order, scope) return end - @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) a = CuArray(T[0]) - @cuda threads=1 kernel(a, order, scope) + @cuda cap=cap threads=1 kernel(a, order, scope) @test Array(a)[1] == 0 end end From f248574ccc75f6142047fb38de1e3373677a9485 Mon Sep 17 00:00:00 2001 From: Valentin 
Churavy Date: Mon, 13 Mar 2023 21:33:25 -0400 Subject: [PATCH 24/32] fix yet another silly mistake --- src/device/intrinsics/synchronization.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index e98baa16a2..fd59214452 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -131,11 +131,11 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) throw(AtomicOrderUnsupported(order)) end else - if order == seq_cst() || - order == consume() || - order == acquire() || - order == acq_rel() || - order == release() + if order == seq_cst || + order == consume || + order == acquire || + order == acq_rel || + order == release threadfence_device() else From 992630ec189c66ac18dfc5273e5af1308ae02205 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:03:15 -0400 Subject: [PATCH 25/32] walkback sm_50 support --- src/compiler/execution.jl | 2 +- src/device/intrinsics/atomics.jl | 26 ++-------------- test/device/intrinsics/lowlevel_atomics.jl | 36 ++++++---------------- 3 files changed, 13 insertions(+), 51 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index c16277d1fd..3ac23cb4dc 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -40,7 +40,7 @@ macro cuda(ex...) 
macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = split_kwargs(kwargs, [:dynamic, :launch], - [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline, :cap], + [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline], [:cooperative, :blocks, :threads, :shmem, :stream]) if !isempty(other_kwargs) key,val = first(other_kwargs).args diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index e8fe0f0cc6..cf60d2d077 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -474,16 +474,7 @@ end atomic_thread_fence(order, scope) return val else - # Fallback to threadfence w/o order + load_volatile - if order == seq_cst - threadfence(scope) - end - val = __load_volatile(ptr) - if order == monotonic - return val - end - threadfence(scope) - return val + throw(AtomicUnsupported{T}()) end end @@ -533,11 +524,7 @@ end end __store_volatile!(ptr, val) else - # Fallback to threadfence w/o order + store_volatile - if order == seq_cst - threadfence(scope) - end - __store_volatile!(ptr, val) + throw(AtomicUnsupported{T}()) end end @@ -616,14 +603,7 @@ end atomic_thread_fence(seq_cst, scope) end else - # Fallback to atomic_cas w/o scope on pre SM_60 - if order == seq_cst || order == acq_rel || order == release - threadfence(scope) - end - old = atomic_cas!(ptr, expected, new) - if order == seq_cst || order == acq_rel || order == acquire # order == consume - threadfence(scope) - end + throw(AtomicUnsupported{T}()) end return old end diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl index 0bedb80a3e..dfe2c06e96 100644 --- a/test/device/intrinsics/lowlevel_atomics.jl +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -1,32 +1,15 @@ using BFloat16s: BFloat16 -function atomic_types(cap) - types = [ - Int32, Int64, - UInt32, UInt64, - Float64, Float32] - if cap >= v"6.0" - append!(types, [ - Int8, Int16, - UInt8, UInt16, - Float16]) - 
end - return types -end - @testset "atomics (low-level) with order" begin @testset "atomic_load" begin - capabilities = (v"3.5", v"6.0", v"7.0") - current_cap = capability(device()) - - capabilities = filter(c->c<=current_cap, capabilities) - - @testset for cap in capabilities - types = atomic_types(cap) + if capability(device()) >= v"6.0" + types = [Int8, Int16, Int32, Int64, + UInt8, UInt16, UInt32, UInt64, + Float64, Float32] + # TODO Float16 scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] - # unsupported_orders = [CUDA.release, CUDA.acq_rel] function kernel(a, order, scope) CUDA.atomic_load(pointer(a), order, scope) @@ -35,7 +18,7 @@ end @testset for (T, order, scope) in Iterators.product(types, orders, scopes) a = CuArray(T[0]) - @cuda cap=cap threads=1 kernel(a, order, scope) + @cuda threads=1 kernel(a, order, scope) @test Array(a)[1] == 0 end end @@ -46,17 +29,16 @@ end types = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float64, Float32] + # TODO Float16 scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered - supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] - unsupported_orders = [CUDA.acquire, CUDA.acq_rel] + orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] function kernel(a, val, order, scope) CUDA.atomic_store!(pointer(a), val, order, scope) return end - @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes) + @testset for (T, order, scope) in Iterators.product(types, orders, scopes) a = CuArray(T[0]) @cuda threads=1 kernel(a, one(T), order, scope) @test Array(a)[1] == one(T) From fa0e3a290afca3b18e278e636db8e50d1d662f73 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:15:12 -0400 Subject: [PATCH 26/32] fix CAS call --- src/device/intrinsics/atomics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/intrinsics/atomics.jl 
b/src/device/intrinsics/atomics.jl index cf60d2d077..acef5aaed5 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -569,7 +569,7 @@ for (order, scope, A, sz) in Iterators.product( end @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = - __cas(sizeof(T), ptr, old, new, order, scope) + __cas!(sizeof(T), ptr, old, new, order, scope) for (scope, A, sz) in Iterators.product( (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), From c9396fe7e484564924beba10f689eb266801dd95 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:18:12 -0400 Subject: [PATCH 27/32] fix threadfence on sm_60 --- src/device/intrinsics/synchronization.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl index fd59214452..2cf19e7cf4 100644 --- a/src/device/intrinsics/synchronization.jl +++ b/src/device/intrinsics/synchronization.jl @@ -132,7 +132,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope) end else if order == seq_cst || - order == consume || + # order == consume || order == acquire || order == acq_rel || order == release From 2f5948807e55eb1bbc6feb5ec729ab60c97ed6cb Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:28:14 -0400 Subject: [PATCH 28/32] fixup! 
fix CAS call --- src/device/intrinsics/atomics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index acef5aaed5..7d041e355d 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -569,7 +569,7 @@ for (order, scope, A, sz) in Iterators.product( end @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = - __cas!(sizeof(T), ptr, old, new, order, scope) + __cas!(Val(sizeof(T)), ptr, old, new, order, scope) for (scope, A, sz) in Iterators.product( (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), From f937f114e7f04b55b22c371937dff10484046ef5 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:28:28 -0400 Subject: [PATCH 29/32] fix store_volatile! call --- src/device/intrinsics/atomics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 7d041e355d..65c8348ced 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -503,7 +503,7 @@ end # Could be done using LLVM. 
@inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} = - __store_volatile(Val(sizeof(T)), ptr, val) + __store_volatile!(Val(sizeof(T)), ptr, val) @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T if order == acq_rel || order == acquire # || order == consume From f7e938a24a28727819921a7378d91fd94abf0f55 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:28:41 -0400 Subject: [PATCH 30/32] test Float16 and BFloat16 --- test/device/intrinsics/lowlevel_atomics.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl index dfe2c06e96..1722b0af3e 100644 --- a/test/device/intrinsics/lowlevel_atomics.jl +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -6,8 +6,7 @@ using BFloat16s: BFloat16 if capability(device()) >= v"6.0" types = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, - Float64, Float32] - # TODO Float16 + BFloat16, Float16, Float64, Float32] scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst] @@ -28,8 +27,7 @@ end if capability(device()) >= v"6.0" types = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, - Float64, Float32] - # TODO Float16 + BFloat16, Float16, Float64, Float32] scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst] From aed3933fc85813253e32a5ebc0badd2ba726c4ef Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 Mar 2023 21:44:42 -0400 Subject: [PATCH 31/32] fix sizeof checks in cas --- src/device/intrinsics/atomics.jl | 19 ++++++++++++++----- test/device/intrinsics/lowlevel_atomics.jl | 1 - 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index 65c8348ced..a370baa912 100644 --- a/src/device/intrinsics/atomics.jl +++ 
b/src/device/intrinsics/atomics.jl @@ -453,6 +453,9 @@ end if order == acq_rel || order == release throw(AtomicOrderUnsupported(order)) end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end if compute_capability() >= sv"7.0" if order == monotonic val = __load(ptr, monotonic, scope) @@ -509,6 +512,9 @@ end if order == acq_rel || order == acquire # || order == consume throw(AtomicOrderUnsupported(order)) end + if sizeof(T) > 8 + throw(AtomicUnsupported{T}()) + end if compute_capability() >= sv"7.0" if order == release __store!(ptr, val, release, scope) @@ -577,16 +583,19 @@ for (scope, A, sz) in Iterators.product( (2, 4, 8)) instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" - @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = + @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end -@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = - __cas!(Val(sizeof(T)), ptr, old, new, scope) +@inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = + __cas_old!(Val(sizeof(T)), ptr, old, new, scope) @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T order = stronger_order(success_order, failure_order) - if compute_capability() >= sv"7.0" && 2 <= sizeof(T) <= 4 + if sizeof(T) > 8 || sizeof(T) < 2 + throw(AtomicUnsupported{T}()) + end + if compute_capability() >= sv"7.0" && 4 <= sizeof(T) <= 8 if order == seq_cst atomic_thread_fence(seq_cst, scope) end @@ -598,7 +607,7 @@ end if order == seq_cst || order == acq_rel || order == release atomic_thread_fence(seq_cst, scope) end - old = __cas!(ptr, expected, new, scope) + old = __cas_old!(ptr, expected, new, scope) if order == seq_cst || order == 
acq_rel || order == acquire # order == consume atomic_thread_fence(seq_cst, scope) end diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl index 1722b0af3e..0db2c30527 100644 --- a/test/device/intrinsics/lowlevel_atomics.jl +++ b/test/device/intrinsics/lowlevel_atomics.jl @@ -51,7 +51,6 @@ end UInt32, UInt64, Float64, Float32] scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope] - # TODO unordered orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel] function kernel(a, expected, desired, success_order, failure_order, scope) From 66fadf5014cb33b84ae9aee235e3a1c0c90cd179 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 21 Mar 2023 12:17:52 -0400 Subject: [PATCH 32/32] handle byte size cas --- src/device/intrinsics/atomics.jl | 33 ++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl index a370baa912..876a456652 100644 --- a/src/device/intrinsics/atomics.jl +++ b/src/device/intrinsics/atomics.jl @@ -567,26 +567,51 @@ for (order, scope, A, sz) in Iterators.product( (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), (BlockScope, DeviceScope, SystemScope), (AS.Generic, AS.Global, AS.Shared), - (4, 8)) + (2, 4, 8)) instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} = @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end +# Handle byte sized cas +for (order, scope, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (BlockScope, DeviceScope, SystemScope), + (AS.Generic, AS.Global, AS.Shared)) + 
instruction = "atom.$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) + end +end + @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T = __cas!(Val(sizeof(T)), ptr, old, new, order, scope) -for (scope, A, sz) in Iterators.product( +for (order, A, sz) in Iterators.product( (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), (AS.Generic, AS.Global, AS.Shared), (2, 4, 8)) - instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" + instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" constraint = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}" @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} = @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) end +# Handle byte sized cas +for (order, A) in Iterators.product( + (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}), + (AS.Generic, AS.Global, AS.Shared)) + instruction = "atom.$(addr_space(A)).cas.$(asm(order)).b8 \$0, [\$1];" + constraint = "=r,l,r,r,~{memory}" + @eval @inline function __cas_old!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} + val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new) + return Core.bitcast(T, val % UInt8) + end +end + @inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T = __cas_old!(Val(sizeof(T)), ptr, old, new, scope) @@ -595,7 +620,7 @@ end if sizeof(T) > 8 || sizeof(T) < 2 throw(AtomicUnsupported{T}()) end - if compute_capability() >= sv"7.0" 
&& 4 <= sizeof(T) <= 8 + if compute_capability() >= sv"7.0" if order == seq_cst atomic_thread_fence(seq_cst, scope) end