From a82a8447deebdf0056ab81bd5805876a373a8c27 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 13 Feb 2023 09:13:54 -0500
Subject: [PATCH 01/32] Implement KernelAbstractions backend in CUDA.jl

Formerly known as CUDAKernels.jl
---
 Manifest.toml              |  62 +++++++--
 Project.toml               |   3 +
 src/CUDA.jl                |   4 +
 src/CUDAKernels.jl         | 254 +++++++++++++++++++++++++++++++++++++
 test/Project.toml          |   3 +
 test/kernelabstractions.jl |  16 +++
 6 files changed, 334 insertions(+), 8 deletions(-)
 create mode 100644 src/CUDAKernels.jl
 create mode 100644 test/kernelabstractions.jl

diff --git a/Manifest.toml b/Manifest.toml
index 6225d8d8a7..2d634a920b 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,9 +2,9 @@
 
 [[AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409"
+git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.2.1"
+version = "1.3.1"
 
 [[Adapt]]
 deps = ["LinearAlgebra", "Requires"]
@@ -18,6 +18,12 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
 [[Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
 
+[[Atomix]]
+deps = ["UnsafeAtomics"]
+git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
+uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+version = "0.1.0"
+
 [[BFloat16s]]
 deps = ["LinearAlgebra", "Printf", "Random", "Test"]
 git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
@@ -64,9 +70,9 @@ version = "0.1.6"
 
 [[Compat]]
 deps = ["Dates", "LinearAlgebra", "UUIDs"]
-git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04"
+git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.6.0"
+version = "4.6.1"
 
 [[CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
@@ -87,9 +93,9 @@ deps = ["ArgTools", "LibCURL", "NetworkOptions"]
 uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 
 [[ExprTools]]
-git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d"
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
 uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.8"
+version = "0.1.9"
 
 [[GPUArrays]]
 deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
@@ -105,9 +111,9 @@ version = "0.1.4"
 
 [[GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb"
+git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.17.2"
+version = "0.17.3"
 
 [[InteractiveUtils]]
 deps = ["Markdown"]
@@ -130,6 +136,12 @@ git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
 uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
 version = "1.4.1"
 
+[[KernelAbstractions]]
+deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "17d0bb94eef881b09c57967be12cca70fefb3304"
+uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+version = "0.9.0"
+
 [[LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
 git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04"
@@ -178,6 +190,12 @@ version = "0.3.23"
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
+[[MacroTools]]
+deps = ["Markdown", "Random"]
+git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.10"
+
 [[Markdown]]
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -253,6 +271,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
+[[SnoopPrecompile]]
+deps = ["Preferences"]
+git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c"
+uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c"
+version = "1.0.3"
+
 [[Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -266,6 +290,17 @@ git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
 version = "2.2.0"
 
+[[StaticArrays]]
+deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
+git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3"
+uuid = "90137ffa-7385-5640-81b9-e52037218182"
+version = "1.5.16"
+
+[[StaticArraysCore]]
+git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
+uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+version = "1.4.0"
+
 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -295,6 +330,17 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 [[Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
+[[UnsafeAtomics]]
+git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
+uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
+version = "0.2.1"
+
+[[UnsafeAtomicsLLVM]]
+deps = ["LLVM", "UnsafeAtomics"]
+git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e"
+uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
+version = "0.1.0"
+
 [[Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
diff --git a/Project.toml b/Project.toml
index b18100dac1..19d8a411b3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ CompilerSupportLibraries_jll = "e66e0078-7015-5450-92f7-15fbd957f2ae"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
@@ -28,6 +29,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 AbstractFFTs = "0.4, 0.5, 1.0"
@@ -47,4 +49,5 @@ RandomNumbers = "1.5.3"
 Reexport = "0.2, 1.0"
 Requires = "0.5, 1.0"
 SpecialFunctions = "1.3, 2"
+UnsafeAtomicsLLVM = "0.1"
 julia = "1.6"
diff --git a/src/CUDA.jl b/src/CUDA.jl
index 5ddccec286..0e918eae14 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -107,6 +107,10 @@ include("../lib/nvml/NVML.jl")
 const has_nvml = NVML.has_nvml
 export NVML, has_nvml
 
+# KernelAbstractions
+include("CUDAKernels.jl")
+export CUDABackend
+
 include("precompile.jl")
 
 end
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
new file mode 100644
index 0000000000..b43a1af8a5
--- /dev/null
+++ b/src/CUDAKernels.jl
@@ -0,0 +1,254 @@
+module CUDAKernels
+
+import KernelAbstractions
+import CUDA
+import UnsafeAtomicsLLVM
+import GPUCompiler
+
+struct CUDABackend <: KernelAbstractions.GPU
+    prefer_blocks::Bool
+    always_inline::Bool
+end
+CUDABackend(;prefer_blocks=false, always_inline=false) = CUDABackend(prefer_blocks, always_inline)
+
+export CUDABackend
+
+KernelAbstractions.allocate(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.CuArray{T}(undef, dims)
+KernelAbstractions.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
+KernelAbstractions.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims)
+
+# Import through parent
+import KernelAbstractions: StaticArrays, Adapt
+import .StaticArrays: MArray
+
+KernelAbstractions.get_backend(::CUDA.CuArray) = CUDABackend()
+KernelAbstractions.get_backend(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDABackend()
+
+KernelAbstractions.synchronize(::CUDABackend) = CUDA.synchronize()
+
+###
+# copyto!
+###
+# - IdDict does not free the memory
+# - WeakRef dict does not unique the key by objectid
+const __pinned_memory = Dict{UInt64, WeakRef}()
+
+function __pin!(a)
+    # use pointer instead of objectid?
+    oid = objectid(a)
+    if haskey(__pinned_memory, oid) && __pinned_memory[oid].value !== nothing
+        return nothing
+    end
+    ad = CUDA.Mem.register(CUDA.Mem.Host, pointer(a), sizeof(a))
+    finalizer(_ -> CUDA.Mem.unregister(ad), a)
+    __pinned_memory[oid] = WeakRef(a)
+    return nothing
+end
+
+function KernelAbstractions.copyto!(::CUDABackend, A, B)
+    A isa Array && __pin!(A)
+    B isa Array && __pin!(B)
+
+    GC.@preserve A B begin
+        destptr = pointer(A)
+        srcptr  = pointer(B)
+        N       = length(A)
+        unsafe_copyto!(destptr, srcptr, N, async=true)
+    end
+    return A
+end
+
+import KernelAbstractions: Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
+
+###
+# Kernel launch
+###
+function launch_config(kernel::Kernel{CUDABackend}, ndrange, workgroupsize)
+    if ndrange isa Integer
+        ndrange = (ndrange,)
+    end
+    if workgroupsize isa Integer
+        workgroupsize = (workgroupsize, )
+    end
+
+    # partition checked that the ndrange's agreed
+    if KernelAbstractions.ndrange(kernel) <: StaticSize
+        ndrange = nothing
+    end
+
+    iterspace, dynamic = if KernelAbstractions.workgroupsize(kernel) <: DynamicSize &&
+        workgroupsize === nothing
+        # use ndrange as preliminary workgroupsize for autotuning
+        partition(kernel, ndrange, ndrange)
+    else
+        partition(kernel, ndrange, workgroupsize)
+    end
+
+    return ndrange, workgroupsize, iterspace, dynamic
+end
+
+function threads_to_workgroupsize(threads, ndrange)
+    total = 1
+    return map(ndrange) do n
+        x = min(div(threads, total), n)
+        total *= x
+        return x
+    end
+end
+
+function (obj::Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=nothing)
+    backend = KernelAbstractions.backend(obj)
+
+    ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
+    # this might not be the final context, since we may tune the workgroupsize
+    ctx = mkcontext(obj, ndrange, iterspace)
+
+    # If the kernel is statically sized we can tell the compiler about that
+    if KernelAbstractions.workgroupsize(obj) <: StaticSize
+        maxthreads = prod(KernelAbstractions.get(KernelAbstractions.workgroupsize(obj)))
+    else
+        maxthreads = nothing
+    end
+
+    kernel = CUDA.@cuda launch=false always_inline=backend.always_inline maxthreads=maxthreads obj.f(ctx, args...)
+
+    # figure out the optimal workgroupsize automatically
+    if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
+        config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange))
+        if backend.prefer_blocks
+            # Prefer blocks over threads
+            threads = min(prod(ndrange), config.threads)
+            # XXX: Some kernels performs much better with all blocks active
+            cu_blocks = max(cld(prod(ndrange), threads), config.blocks)
+            threads = cld(prod(ndrange), cu_blocks)
+        else
+            threads = config.threads
+        end
+
+        workgroupsize = threads_to_workgroupsize(threads, ndrange)
+        iterspace, dynamic = partition(obj, ndrange, workgroupsize)
+        ctx = mkcontext(obj, ndrange, iterspace)
+    end
+
+    nblocks = length(blocks(iterspace))
+    threads = length(workitems(iterspace))
+
+    if nblocks == 0
+        return nothing
+    end
+
+    # Launch kernel
+    kernel(ctx, args...; threads=threads, blocks=nblocks)
+
+    return nothing
+end
+
+# list of overrides (only for Julia 1.6)
+const overrides = Expr[]
+
+macro device_override(ex)
+    ex = macroexpand(__module__, ex)
+    if Meta.isexpr(ex, :call)
+        # a bare call is not a method definition: reject it without evaluating it
+        error("@device_override expects a method definition, got a call: ", ex)
+    end
+    code = quote
+        $GPUCompiler.@override($CUDA.method_table, $ex)
+    end
+    if isdefined(Base.Experimental, Symbol("@overlay"))
+        return esc(code)
+    else
+        push!(overrides, code)
+        return
+    end
+end
+
+function __init__()
+    precompiling = ccall(:jl_generating_output, Cint, ()) != 0
+    precompiling && return
+    # register device overrides
+    eval(Expr(:block, overrides...))
+    empty!(overrides)
+end
+
+import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
+import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
+import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
+
+function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace)
+    CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
+end
+
+@device_override @inline function __index_Local_Linear(ctx)
+    return CUDA.threadIdx().x
+end
+
+@device_override @inline function __index_Group_Linear(ctx)
+    return CUDA.blockIdx().x
+end
+
+@device_override @inline function __index_Global_Linear(ctx)
+    I =  @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
+    # TODO: This is unfortunate, can we get the linear index cheaper
+    @inbounds LinearIndices(__ndrange(ctx))[I]
+end
+
+@device_override @inline function __index_Local_Cartesian(ctx)
+    @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x]
+end
+
+@device_override @inline function __index_Group_Cartesian(ctx)
+    @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x]
+end
+
+@device_override @inline function __index_Global_Cartesian(ctx)
+    return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
+end
+
+@device_override @inline function __validindex(ctx)
+    if __dynamic_checkbounds(ctx)
+        I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
+        return I in __ndrange(ctx)
+    else
+        return true
+    end
+end
+
+import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace
+import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize, __size
+
+###
+# GPU implementation of shared memory
+###
+
+@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+    CUDA.CuStaticSharedArray(T, Dims)
+end
+
+###
+# GPU implementation of scratch memory
+# - private memory for each workitem
+###
+
+@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
+    MArray{__size(Dims), T}(undef)
+end
+
+@device_override @inline function __synchronize()
+    CUDA.sync_threads()
+end
+
+@device_override @inline function __print(args...)
+    CUDA._cuprint(args...)
+end
+
+###
+# GPU implementation of const memory
+###
+
+Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental.Const(a)
+
+# Argument conversion
+KernelAbstractions.argconvert(k::Kernel{CUDABackend}, arg) = CUDA.cudaconvert(arg)
+
+end
diff --git a/test/Project.toml b/test/Project.toml
index 95252c2eda..ec5354dea7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -7,7 +7,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -15,6 +17,7 @@ REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
new file mode 100644
index 0000000000..05f5d36978
--- /dev/null
+++ b/test/kernelabstractions.jl
@@ -0,0 +1,16 @@
+import KernelAbstractions
+using Test
+
+include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
+
+using CUDA
+using CUDA.CUDAKernels
+
+if CUDA.functional()
+    CUDA.versioninfo()
+    CUDA.allowscalar(false)
+    Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CUDA.CuDeviceArray)
+    for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
+        Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CUDA.CuDeviceArray)
+    end
+end

From f510c5070cd156b2bde961cee4cbf8871ba0c415 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 15:31:22 -0500
Subject: [PATCH 02/32] Use Atomix

---
 Project.toml                             |   3 +
 src/CUDAKernels.jl                       |   3 +-
 src/device/intrinsics.jl                 |  16 ++
 src/device/intrinsics/atomics.jl         | 264 +++++++++++++++--------
 src/device/intrinsics/synchronization.jl | 139 +++++++++---
 test/device/intrinsics/atomics.jl        |   2 +-
 6 files changed, 305 insertions(+), 122 deletions(-)

diff --git a/Project.toml b/Project.toml
index 19d8a411b3..4e35c695f5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "4.0.1"
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc"
@@ -29,11 +30,13 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 AbstractFFTs = "0.4, 0.5, 1.0"
 Adapt = "3.3"
+Atomix = "0.1"
 BFloat16s = "0.2, 0.3, 0.4"
 CEnum = "0.2, 0.3, 0.4"
 CUDA_Driver_jll = "0.2"
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index b43a1af8a5..628f528f75 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -2,8 +2,6 @@ module CUDAKernels
 
 import KernelAbstractions
 import CUDA
-import UnsafeAtomicsLLVM
-import GPUCompiler
 
 struct CUDABackend <: KernelAbstractions.GPU
     prefer_blocks::Bool
@@ -146,6 +144,7 @@ end
 # list of overrides (only for Julia 1.6)
 const overrides = Expr[]
 
+import GPUCompiler
 macro device_override(ex)
     ex = macroexpand(__module__, ex)
     if Meta.isexpr(ex, :call)
diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl
index 443a7fd420..30e797cbd7 100644
--- a/src/device/intrinsics.jl
+++ b/src/device/intrinsics.jl
@@ -3,6 +3,22 @@
 # special intrinsics for writing version-dependent code
 include("intrinsics/version.jl")
 
+abstract type SyncScope end
+struct SystemScope <: SyncScope end
+struct DeviceScope <: SyncScope end
+struct BlockScope <: SyncScope end
+
+const system_scope = SystemScope()
+const device_scope = DeviceScope()
+const block_scope = BlockScope()
+
+import UnsafeAtomics
+using UnsafeAtomics.Internal: LLVMOrdering
+using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst
+# Note CUDA C++ has also consume ordering which LLVM does not support
+# monotonic -> relaxed
+# unordered -> ??? maybe weak
+
 # extensions to the C language
 include("intrinsics/memory_shared.jl")
 include("intrinsics/indexing.jl")
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 15b8c10e39..2021529f14 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -1,5 +1,7 @@
 # Atomic Functions (B.12)
 
+# TODO replace the below with UnsafeAtomicsLLVM if possible
+
 #
 # Low-level intrinsics
 #
@@ -357,117 +359,193 @@ This operation is only supported for values of type Int32.
 """
 atomic_dec!
 
+asm(::Type{LLVMOrdering{:monotonic}}) = :relaxed
+asm(::Type{LLVMOrdering{Order}}) where Order = Order
+
+asm(::Type{SystemScope}) = :sys
+asm(::Type{DeviceScope}) = :gpu
+asm(::Type{BlockScope}) = :cta
+
+# Define ordered, scoped PTX loads (`ld.<order>.<scope>.b64/.b32`) for each (ordering, scope) pair.
+for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}),
+                                        (BlockScope, DeviceScope, SystemScope))
+    asm_b64, asm_b32 = ("ld.$(asm(order)).$(asm(scope)).$w \$0, [\$1];" for w in ("b64", "b32"))
+    @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
+    @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
+end
 
+# Ordered, scoped load from `ptr`, dispatching on the size of `T` (4 or 8 bytes).
+@inline function __load(ptr::LLVMPtr{T}, order, scope) where T
+    if sizeof(T) == 4
+        return __load_32(ptr, order, scope)
+    elseif sizeof(T) == 8
+        return __load_64(ptr, order, scope)
+    end
+    error("__load: unsupported element size, expected 4 or 8 bytes")
+end
 
-#
-# High-level interface
-#
-
-# prototype of a high-level interface for performing atomic operations on arrays
-#
-# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
-# well as acquire/release operations to implement the fallback functionality where any
-# operation can be applied atomically.
+# Volatile PTX loads (64- and 32-bit) from `ptr`. Could be done using LLVM.
+@inline __load_volatile_64(ptr::LLVMPtr{T, AS}) where {T, AS} =
+    @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
+@inline __load_volatile_32(ptr::LLVMPtr{T, AS}) where {T, AS} =
+    @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
+
+# Volatile load from `ptr`, dispatching on the size of `T` (4 or 8 bytes).
+@inline function __load_volatile(ptr::LLVMPtr{T}) where T
+    if sizeof(T) == 4
+        return __load_volatile_32(ptr)
+    elseif sizeof(T) == 8
+        return __load_volatile_64(ptr)
+    end
+    error("__load_volatile: unsupported element size, expected 4 or 8 bytes")
+end
 
-if VERSION <= v"1.7-"
-export @atomic
+# Atomic load from `ptr` with ordering `order` at `scope`; `release`/`acq_rel` are invalid for loads.
+@inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T
+    if order == acq_rel || order == release
+        error("atomic_load: release and acq_rel are not valid orderings for a load")
+    end
+    if compute_capability() >= sv"7.0"
+        if order == monotonic
+            return __load(ptr, monotonic, scope)
+        end
+        if order == seq_cst
+            atomic_thread_fence(seq_cst, scope)
+        end
+        return __load(ptr, acquire, scope)
+    else
+        # no ordered `ld` on this path: volatile load, bracketed by fences as the ordering requires
+        if order == seq_cst
+            atomic_thread_fence(seq_cst, scope)
+        end
+        val = __load_volatile(ptr)
+        if order == monotonic
+            return val
+        end
+        atomic_thread_fence(order, scope)
+        return val
+    end
 end
 
-const inplace_ops = Dict(
-    :(+=)   => :(+),
-    :(-=)   => :(-),
-    :(*=)   => :(*),
-    :(/=)   => :(/),
-    :(\=)   => :(\),
-    :(%=)   => :(%),
-    :(^=)   => :(^),
-    :(&=)   => :(&),
-    :(|=)   => :(|),
-    :(⊻=)   => :(⊻),
-    :(>>>=) => :(>>>),
-    :(>>=)  => :(>>),
-    :(<<=)  => :(<<),
-)
+for (order, scope) in Iterators.product((LLVMOrdering{:release}, LLVMOrdering{:monotonic}),
+                                        (BlockScope, DeviceScope, SystemScope))
+    asm_b64 = "st.$(asm(order)).$(asm(scope)).b64 [\$0], \$1;"
+    asm_b32 = "st.$(asm(order)).$(asm(scope)).b32 [\$0], \$1;"
+    @eval @inline __store_64!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b64, "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+    @eval @inline __store_32!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b32, "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+end
 
-struct AtomicError <: Exception
-    msg::AbstractString
+# Ordered, scoped store of `val` to `ptr`, dispatching on the size of `T` (4 or 8 bytes).
+@inline function __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T
+    if sizeof(T) == 4
+        return __store_32!(ptr, val, order, scope)
+    elseif sizeof(T) == 8
+        return __store_64!(ptr, val, order, scope)
+    end
+    error("__store!: unsupported element size, expected 4 or 8 bytes")
 end
 
-Base.showerror(io::IO, err::AtomicError) =
-    print(io, "AtomicError: ", err.msg)
+# Could be done using LLVM.
+@inline __store_volatile_32!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} =
+    @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+@inline __store_volatile_64!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} =
+    @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+
+# Volatile store of `val` to `ptr`, dispatching on the size of `T` (4 or 8 bytes).
+@inline function __store_volatile!(ptr::LLVMPtr{T}, val::T) where T
+    if sizeof(T) == 4
+        return __store_volatile_32!(ptr, val)
+    elseif sizeof(T) == 8
+        return __store_volatile_64!(ptr, val)
+    end
+    error("__store_volatile!: unsupported element size, expected 4 or 8 bytes")
+end
 
-"""
-    @atomic a[I] = op(a[I], val)
-    @atomic a[I] ...= val
-
-Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
-operation `op` on that value and a second value `val`, and writes the result back to the
-array. This sequence can be written out as a regular assignment, in which case the same
-array element should be used in the left and right hand side of the assignment, or as an
-in-place application of a known operator. In both cases, the array reference should be pure
-and not induce any side-effects.
-
-!!! warn
-    This interface is experimental, and might change without warning.  Use the lower-level
-    `atomic_...!` functions for a stable API, albeit one limited to natively-supported ops.
-"""
-macro atomic(ex)
-    # decode assignment and call
-    if ex.head == :(=)
-        ref = ex.args[1]
-        rhs = ex.args[2]
-        Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
-        op = rhs.args[1]
-        if rhs.args[2] != ref
-            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
+@inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T
+    if order == acq_rel || order == acquire # || order == consume
+        error("atomic_store!: acquire and acq_rel are not valid orderings for a store")
+    end
+    if compute_capability() >= sv"7.0"
+        if order == release
+            __store!(ptr, val, release, scope)
+            return
+        end
+        if order == seq_cst
+            atomic_thread_fence(seq_cst, scope)
         end
-        val = rhs.args[3]
-    elseif haskey(inplace_ops, ex.head)
-        op = inplace_ops[ex.head]
-        ref = ex.args[1]
-        val = ex.args[2]
+        __store!(ptr, val, monotonic, scope)
     else
-        throw(AtomicError("unknown @atomic expression"))
+        if order == seq_cst || order == release # volatile store carries no ordering: fence first
+            atomic_thread_fence(order, scope)
+        end
+        __store_volatile!(ptr, val)
     end
+end
 
-    # decode array expression
-    Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression"))
-    array = ref.args[1]
-    indices = Expr(:tuple, ref.args[2:end]...)
+#
+# High-level interface
+#
+import Atomix: @atomic, @atomicswap, @atomicreplace
+# import UnsafeAtomicsLLVM
+
+if VERSION <= v"1.7"
+    export @atomic
+end
 
-    esc(quote
-        $atomic_arrayset($array, $indices, $op, $val)
-    end)
+using Atomix: Atomix, IndexableRef
+
+const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable}
+
+@inline function Atomix.get(ref::CuIndexableRef, order)
+    atomic_load(Atomix.pointer(ref), order)
 end
 
-# FIXME: make this respect the indexing style
-@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} =
-    atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))
-
-# native atomics
-for (op,impl,typ) in [(+,   atomic_add!, [UInt32,Int32,UInt64,Int64,Float32]),
-                      (-,   atomic_sub!, [UInt32,Int32,UInt64,Int64,Float32]),
-                      (&,   atomic_and!, [UInt32,Int32,UInt64,Int64]),
-                      (|,   atomic_or!,  [UInt32,Int32,UInt64,Int64]),
-                      (⊻,   atomic_xor!, [UInt32,Int32,UInt64,Int64]),
-                      (max, atomic_max!, [UInt32,Int32,UInt64,Int64]),
-                      (min, atomic_min!, [UInt32,Int32,UInt64,Int64])]
-    @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op),
-                                  val::T) where {T<:Union{$(typ...)}} =
-        $impl(pointer(A, I), val)
+@inline function Atomix.set!(ref::CuIndexableRef, v, order)
+    atomic_store!(Atomix.pointer(ref), v, order)
 end
 
-# native atomics that are not supported on all devices
-@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::typeof(+),
-                                 val::T) where {T <: Union{Float64}}
-    ptr = pointer(A, I)
-    if compute_capability() >= sv"6.0"
-        atomic_add!(ptr, val)
-    else
-        atomic_op!(ptr, op, val)
+@inline function Atomix.replace!(
+    ref::CuIndexableRef,
+    expected,
+    desired,
+    success_ordering,
+    failure_ordering,
+)
+    # TODO success_ordering and failure
+    ptr = Atomix.pointer(ref)
+    expected = convert(eltype(ref), expected)
+    desired = convert(eltype(ref), desired)
+    begin
+        old = atomic_cas!(ptr, expected, desired)
     end
+    return (; old = old, success = old === expected)
 end
 
-# fallback using compare-and-swap
-@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} =
-    atomic_op!(pointer(A, I), op, val)
+@inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP}
+    x = convert(eltype(ref), x)
+    ptr = Atomix.pointer(ref)
+    begin
+        old = if op === (+)
+            atomic_add!(ptr, x)
+        elseif op === (-)
+            atomic_sub!(ptr, x)
+        elseif op === (&)
+            atomic_and!(ptr, x)
+        elseif op === (|)
+            atomic_or!(ptr, x)
+        elseif op === xor
+            atomic_xor!(ptr, x)
+        elseif op === min
+            atomic_min!(ptr, x)
+        elseif op === max
+            atomic_max!(ptr, x)
+        else
+            error("not implemented")
+        end
+    end
+    return old => op(old, x)
+end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index 9c76737e36..1eb881db65 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -83,39 +83,126 @@ the warp.
              Cvoid, Tuple{UInt32}, convert(UInt32, mask))
 end
 
+@inline threadfence(::BlockScope) = threadfence_block()
+@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+@inline threadfence_sc_block() = @asmcall("fence.sc.cta;", "~{memory}", true, Cvoid, Tuple{})
+@inline threadfence_acq_rel_block() = @asmcall("fence.acq_rel.cta;", "~{memory}", true, Cvoid, Tuple{})
+
+function atomic_thread_fence(order, scope::BlockScope)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+            threadfence_sc_block()
+        elseif order == acquire || order == acq_rel || order == release # || order == consume
+            threadfence_acq_rel_block()
+        else
+            assert(false)
+        end
+    else
+        if order == seq_cst ||
+         # order == consume ||
+           order == acquire ||
+           order == acq_rel ||
+           order == release
+
+            threadfence_block()
+        else
+            assert(false)
+        end
+    end
+end
+
+@inline threadfence(::DeviceScope=device_scope) = threadfence_device()
+@inline threadfence_device() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+@inline threadfence_sc_device() = @asmcall("fence.sc.gpu;", "~{memory}", true, Cvoid, Tuple{})
+@inline threadfence_acq_rel_device() = @asmcall("fence.acq_rel.gpu;", "~{memory}", true, Cvoid, Tuple{})
+
+function atomic_thread_fence(order, scope::DeviceScope=device_scope)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+
+            threadfence_sc_device()
+        elseif order == acquire ||
+             # order == consume ||
+               order == acq_rel ||
+               order == release
+
+            threadfence_acq_rel_device()
+        else
+            assert(false)
+        end
+    else
+        if order == seq_cst() ||
+           order == consume() ||
+           order == acquire() ||
+           order == acq_rel() ||
+           order == release()
+
+            threadfence_device()
+        else
+            assert(false)
+        end
+    end
+end
+
+@inline threadfence(::SystemScope) = threadfence_system()
+@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+@inline threadfence_sc_system() = @asmcall("fence.sc.sys;", "~{memory}", true, Cvoid, Tuple{})
+@inline threadfence_acq_rel_system() = @asmcall("fence.acq_rel.sys;", "~{memory}", true, Cvoid, Tuple{})
+
+function atomic_thread_fence(order, scope::SystemScope)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+
+            threadfence_sc_system()
+        elseif order == acquire ||
+            #  order == consume ||
+               order == acq_rel ||
+               order == release
+
+            threadfence_acq_rel_system()
+        else
+            assert(false)
+        end
+    else
+        if order == seq_cst ||
+         # order == consume ||
+           order == acquire ||
+           order == acq_rel ||
+           order == release
+
+            threadfence_system()
+        else
+            assert(false)
+        end
+    end
+end
+
 """
-    threadfence_block()
+    threadfence(::SyncScope=device_scope)
 
 A memory fence that ensures that:
-- All writes to all memory made by the calling thread before the call to `threadfence_block()`
-  are observed by all threads in the block of the calling thread as occurring before all writes
-  to all memory made by the calling thread after the call to `threadfence_block()`
-- All reads from all memory made by the calling thread before the call to `threadfence_block()`
-  are ordered before all reads from all memory made by the calling thread after the call to `threadfence_block()`.
-"""
-@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+- All writes to all memory made by the calling thread before the call to `threadfence(scope)`
+  are observed by all threads in the scope of the calling thread as occurring before all writes
+  to all memory made by the calling thread after the call to `threadfence(scope)`
+- All reads from all memory made by the calling thread before the call to `threadfence(scope)`
+  are ordered before all reads from all memory made by the calling thread after the call to `threadfence(scope)`.
 
-"""
-    threadfence()
+SyncScope can be one of `block_scope`, `device_scope`, or `system_scope`.
+  - `block_scope` orders reads and writes on the *same* block.
+  - `device_scope` orders reads and writes on the *same* device.
+  - `system_scope` orders reads and writes across all threads in the device,
+    host threads, and all threads in peer devices.
 
-A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the
-calling thread and also ensures that no writes to all memory made by the calling thread after
-the call to `threadfence()` are observed by any thread in the device as occurring before any
-write to all memory made by the calling thread before the call to `threadfence()`.
+See [`atomic_thread_fence`](@ref) for a variant that takes atomic orderings.
 
-Note that for this ordering guarantee to be true, the observing threads must truly observe the
-memory and not cached versions of it; this is requires the use of volatile loads and stores,
-which is not available from Julia right now.
-"""
-@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+!!! note
+    Note that for this ordering guarantee to be true, the observing threads must truly observe the
+    memory and not cached versions of it; this requires the use of atomic loads and stores.
 
 """
-    threadfence_system()
+function threadfence end
 
-A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the
-calling thread and also ensures that all writes to all memory made by the calling thread
-before the call to `threadfence_system()` are observed by all threads in the device,
-host threads, and all threads in peer devices as occurring before all writes to all
-memory made by the calling thread after the call to `threadfence_system()`.
 """
-@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+    atomic_thread_fence(order::Atomix.Ordering, ::SyncScope=device_scope)
+"""
+function atomic_thread_fence end
\ No newline at end of file
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index 29810defe7..6507267b35 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -1,5 +1,5 @@
 # TODO: unify with Base.@atomic
-using CUDA: @atomic
+using CUDA: @atomic, @atomicswap, @atomicreplace
 using BFloat16s: BFloat16
 
 @testset "atomics (low-level)" begin

From 871ea08ea7f24f883fedc1e2f4994f38e781ff1e Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 15:51:06 -0500
Subject: [PATCH 03/32] Use cas

---
 src/device/intrinsics/atomics.jl | 122 +++++++++++++++++++++++--------
 1 file changed, 92 insertions(+), 30 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 2021529f14..c4f9d47f40 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -485,6 +485,68 @@ end
         __store_volatile!(ptr, val)
     end
 end
+,
+for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                                        (BlockScope, DeviceScope, SystemScope))
+    asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
+    asm_b32 = "atom.cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
+    @eval @inline __cas_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+    @eval @inline __cas_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} =
+        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+end
+
+function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
+    if sizeof(T) == 4
+        __cas_32!(ptr, old, new, order, scope)
+    elseif sizeof(T) == 8
+        __cas_64!(ptr, old, new, order, scope)
+    else
+        assert(false)
+    end
+end
+
+for scope in (Block, Device, System)
+    asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
+    asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
+    @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T =
+        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+    @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T =
+        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+end
+
+function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
+    if sizeof(T) == 4
+        __cas__volatile_32!(ptr, old, new, scope)
+    elseif sizeof(T) == 8
+        __cas__volatile_64!(ptr, old, new, scope)
+    else
+        assert(false)
+    end
+end
+
+function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::System=System()) where T
+    order = stronger_order(success_order, failure_order)
+    if compute_capability() >= sv"7.0"
+        if order == seq_cst
+            atomic_thread_fence(seq_cst, scope)
+        end
+        if order == seq_cst # order == consume
+            order = Acquire()
+        end
+        val = __cas!(ptr, old, new, order, scope)
+    else
+        if order == seq_cst || order == acq_rel || order == release
+            atomic_thread_fence(seq_cst, scope)
+        end
+        val = __cas_volatile!(ptr, old, new, scope)
+        if order == seq_cst || order == acq_rel || order == acquire # order == consume
+            atomic_thread_fence(seq_cst, scope)
+        end
+    end
+    success = val == old
+    return (; old, success)
+end
 
 #
 # High-level interface
@@ -508,44 +570,44 @@ end
     atomic_store!(Atomix.pointer(ref), v, order)
 end
 
-@inline function Atomix.replace!(
-    ref::CuIndexableRef,
-    expected,
-    desired,
-    success_ordering,
-    failure_ordering,
-)
-    # TODO success_ordering and failure
+@inline function Atomix.replace!(ref::CuIndexableRef,expected,desired,
+                                 success_ordering,failure_ordering)
     ptr = Atomix.pointer(ref)
     expected = convert(eltype(ref), expected)
     desired = convert(eltype(ref), desired)
-    begin
-        old = atomic_cas!(ptr, expected, desired)
+    return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering)
+end
+
+@inline modify!(ptr, op::OP, x, order) where {OP}
+    success = false
+    while !success
+        expected = atomic_load(ptr, order)
+        new = op(expected, new)
+        old, succss = atomic_cas!(ptr, old, new, order, relaxed)
     end
-    return (; old = old, success = old === expected)
+    return old => new
 end
 
 @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP}
     x = convert(eltype(ref), x)
     ptr = Atomix.pointer(ref)
-    begin
-        old = if op === (+)
-            atomic_add!(ptr, x)
-        elseif op === (-)
-            atomic_sub!(ptr, x)
-        elseif op === (&)
-            atomic_and!(ptr, x)
-        elseif op === (|)
-            atomic_or!(ptr, x)
-        elseif op === xor
-            atomic_xor!(ptr, x)
-        elseif op === min
-            atomic_min!(ptr, x)
-        elseif op === max
-            atomic_max!(ptr, x)
-        else
-            error("not implemented")
-        end
-    end
+    # TODO: Support hardware variants
+    # old = if op === (+)
+    #     atomic_add!(ptr, x)
+    # elseif op === (-)
+    #     atomic_sub!(ptr, x)
+    # elseif op === (&)
+    #     atomic_and!(ptr, x)
+    # elseif op === (|)
+    #     atomic_or!(ptr, x)
+    # elseif op === xor
+    #     atomic_xor!(ptr, x)
+    # elseif op === min
+    #     atomic_min!(ptr, x)
+    # elseif op === max
+    #     atomic_max!(ptr, x)
+    # else
+        modify!(ptr, op, x, ord)
+    # end
     return old => op(old, x)
 end

From 67c2bcd57793fc0bab1ddff60e1696c3d2d9cf2a Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 16:12:53 -0500
Subject: [PATCH 04/32] fixup! Use cas

---
 src/device/intrinsics/atomics.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c4f9d47f40..fbcb2106c2 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -485,7 +485,7 @@ end
         __store_volatile!(ptr, val)
     end
 end
-,
+
 for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
                                         (BlockScope, DeviceScope, SystemScope))
     asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
@@ -506,12 +506,12 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     end
 end
 
-for scope in (Block, Device, System)
+for scope in (BlockScope, DeviceScope, SystemScope)
     asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
     asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T =
+    @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
         @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
-    @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where T =
+    @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
         @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
 end
 
@@ -525,7 +525,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     end
 end
 
-function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::System=System()) where T
+function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
     if compute_capability() >= sv"7.0"
         if order == seq_cst
@@ -578,7 +578,7 @@ end
     return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering)
 end
 
-@inline modify!(ptr, op::OP, x, order) where {OP}
+@inline function modify!(ptr, op::OP, x, order) where {OP}
     success = false
     while !success
         expected = atomic_load(ptr, order)

From c00d65b0862000144e9270e86302665b23ca82ee Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 17:13:43 -0500
Subject: [PATCH 05/32] Support load and store of Int8, Int16

---
 src/device/intrinsics/atomics.jl  | 42 +++++++++++++++----------------
 test/device/intrinsics/atomics.jl | 17 ++-----------
 2 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index fbcb2106c2..1a26239a7b 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -386,17 +386,18 @@ end
     end
 end
 
-# Could be done using LLVM.
-@inline __load_volatile_64(ptr::LLVMPtr{T, AS}) where {T, AS} =
-    @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
-@inline __load_volatile_32(ptr::LLVMPtr{T, AS}) where {T, AS} =
-    @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
-
-@inline function __load_volatile(ptr::LLVMPtr{T}) where T
-    if sizeof(T) == 4
-        __load_volatile_32(ptr)
+__supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4
+
+# Could be done using LLVM  
+@inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS}
+    if sizeof(T) == 1
+        @asmcall("ld.volatile.b8  \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+    elseif sizeof(T) == 2
+        @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+    elseif sizeof(T) == 4
+        @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
     elseif sizeof(T) == 8
-        __load_volatile_64(ptr)
+        @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
     else
         assert(false)
     end
@@ -406,7 +407,7 @@ end
     if order == acq_rel || order == release
         assert(false)
     end
-    if compute_capability() >= sv"7.0"
+    if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == monotonic
             val = __load(ptr, monotonic, scope)
             return val
@@ -450,16 +451,15 @@ end
 end
 
 # Could be done using LLVM.
-@inline __store_volatile_32!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} =
-    @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-@inline __store_volatile_64!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS} =
-    @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-
-@inline function __store_volatile!(ptr::LLVMPtr{T}, val::T) where T
-    if sizeof(T) == 4
-        __store_volatile_32!(ptr, val)
+@inline function __store_volatile!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS}
+    if sizeof(T) == 1
+        @asmcall("st.volatile.b8 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+    elseif sizeof(T) == 2
+        @asmcall("st.volatile.b16 [\$0], \$1;", "l,h,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+    elseif sizeof(T) == 4
+        @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
     elseif sizeof(T) == 8
-        __store_volatile_64!(ptr, val)
+        @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
     else
         assert(false)
     end
@@ -469,7 +469,7 @@ end
     if order == acq_rel || order == acquire # || order == consume
         assert(false)
     end
-    if compute_capability() >= sv"7.0"
+    if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == release
             __store!(ptr, val, release, scope)
             return
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index 6507267b35..f31fdd8007 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -417,24 +417,11 @@ end
         @test isnan(Array(a)[1])
     end
 
-    using CUDA: AtomicError
-
-    @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin
-        @atomic a[1] = 1
-    end
-    @test_throws_macro AtomicError("right-hand side of an @atomic assignment should be a call") @macroexpand begin
-        @atomic a[1] = b ? 1 : 2
-    end
-
-    @test_throws_macro AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side") @macroexpand begin
-        @atomic a[1] = a[2] + 1
-    end
-
-    @test_throws_macro AtomicError("unknown @atomic expression") @macroexpand begin
+    @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin
         @atomic wat(a[1])
     end
 
-    @test_throws_macro AtomicError("@atomic should be applied to an array reference expression") @macroexpand begin
+    @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin
         @atomic a = a + 1
     end
 end

From df395755487d9b30e74037dd2c94d81c76f83f82 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 17:32:43 -0500
Subject: [PATCH 06/32] be less stupid

---
 src/device/intrinsics/atomics.jl         | 19 ++++++++++---------
 src/device/intrinsics/synchronization.jl | 12 ++++++------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 1a26239a7b..0bbe1ce974 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -382,11 +382,11 @@ end
     elseif sizeof(T) == 8
         __load_64(ptr, order, scope)
     else
-        assert(false)
+        @assert(false)
     end
 end
 
-__supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4
+__supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8
 
 # Could be done using LLVM  
 @inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS}
@@ -399,13 +399,13 @@ __supports_atomic(::Type{T}) where T = sizeof(T) == 2 || sizeof(T) ==4
     elseif sizeof(T) == 8
         @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
     else
-        assert(false)
+        @assert(false)
     end
 end
 
 @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == release
-        assert(false)
+        @assert(false)
     end
     if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == monotonic
@@ -446,7 +446,7 @@ end
     elseif sizeof(T) == 8
         __store_64!(ptr, val, order, scope)
     else
-        assert(false)
+        @assert(false)
     end
 end
 
@@ -461,13 +461,13 @@ end
     elseif sizeof(T) == 8
         @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
     else
-        assert(false)
+        @assert(false)
     end
 end
 
 @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == acquire # || order == consume
-        assert(false)
+        @assert(false)
     end
     if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == release
@@ -502,7 +502,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     elseif sizeof(T) == 8
         __cas_64!(ptr, old, new, order, scope)
     else
-        assert(false)
+        @assert(false)
     end
 end
 
@@ -521,7 +521,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     elseif sizeof(T) == 8
         __cas__volatile_64!(ptr, old, new, scope)
     else
-        assert(false)
+        @assert(false)
     end
 end
 
@@ -567,6 +567,7 @@ const CuIndexableRef{Indexable<:CuDeviceArray} = IndexableRef{Indexable}
 end
 
 @inline function Atomix.set!(ref::CuIndexableRef, v, order)
+    v = convert(eltype(ref), v)
     atomic_store!(Atomix.pointer(ref), v, order)
 end
 
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index 1eb881db65..14fe2dbb49 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -95,7 +95,7 @@ function atomic_thread_fence(order, scope::BlockScope)
         elseif order == acquire || order == acq_rel || order == release # || order == consume
             threadfence_acq_rel_block()
         else
-            assert(false)
+            @assert(false)
         end
     else
         if order == seq_cst ||
@@ -106,7 +106,7 @@ function atomic_thread_fence(order, scope::BlockScope)
 
             threadfence_block()
         else
-            assert(false)
+            @assert(false)
         end
     end
 end
@@ -128,7 +128,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
 
             threadfence_acq_rel_device()
         else
-            assert(false)
+            @assert(false)
         end
     else
         if order == seq_cst() ||
@@ -139,7 +139,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
 
             threadfence_device()
         else
-            assert(false)
+            @assert(false)
         end
     end
 end
@@ -161,7 +161,7 @@ function atomic_thread_fence(order, scope::SystemScope)
 
             threadfence_acq_rel_system()
         else
-            assert(false)
+            @assert(false)
         end
     else
         if order == seq_cst ||
@@ -172,7 +172,7 @@ function atomic_thread_fence(order, scope::SystemScope)
 
             threadfence_system()
         else
-            assert(false)
+            @assert(false)
         end
     end
 end

From 6d58044b98cc15034f79788f949ebe02e795c80d Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 17:47:05 -0500
Subject: [PATCH 07/32] fixup! be less stupid

---
 src/device/intrinsics/atomics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 0bbe1ce974..0daf94fa42 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -608,7 +608,7 @@ end
     # elseif op === max
     #     atomic_max!(ptr, x)
     # else
-        modify!(ptr, op, x, ord)
+        modify!(ptr, op, x, order)
     # end
     return old => op(old, x)
 end

From ce4482d3014528fa8afe6c02395fe42a1f2e0142 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 18:14:46 -0500
Subject: [PATCH 08/32] fix modify! implementation

---
 src/device/intrinsics/atomics.jl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 0daf94fa42..dd548dd1ac 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -525,7 +525,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     end
 end
 
-function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
+function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
     if compute_capability() >= sv"7.0"
         if order == seq_cst
@@ -534,17 +534,17 @@ function atomic_cas!(ptr::LLVMPtr{T}, old::T, new::T, success_order, failure_ord
         if order == seq_cst # order == consume
             order = Acquire()
         end
-        val = __cas!(ptr, old, new, order, scope)
+        old = __cas!(ptr, expected, new, order, scope)
     else
         if order == seq_cst || order == acq_rel || order == release
             atomic_thread_fence(seq_cst, scope)
         end
-        val = __cas_volatile!(ptr, old, new, scope)
+        old = __cas_volatile!(ptr, expected, new, scope)
         if order == seq_cst || order == acq_rel || order == acquire # order == consume
             atomic_thread_fence(seq_cst, scope)
         end
     end
-    success = val == old
+    success = expected == old
     return (; old, success)
 end
 
@@ -581,12 +581,12 @@ end
 
 @inline function modify!(ptr, op::OP, x, order) where {OP}
     success = false
+    expected = atomic_load(ptr, order)
     while !success
-        expected = atomic_load(ptr, order)
-        new = op(expected, new)
-        old, succss = atomic_cas!(ptr, old, new, order, relaxed)
+        new = op(expected, x)
+        expected, success = atomic_cas!(ptr, expected, new, order, relaxed)
     end
-    return old => new
+    return expected => new
 end
 
 @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP}

From b4fd958093ec0c58b9d13c37f51f3f61e7776c0e Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 18:25:54 -0500
Subject: [PATCH 09/32] fixup! fix modify! implementation

---
 src/device/intrinsics/atomics.jl | 33 ++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index dd548dd1ac..c811c31581 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -486,6 +486,34 @@ end
     end
 end
 
+order(::LLVMOrdering{:monotonic}) = 1
+# order(::Consume) = 2
+order(::LLVMOrdering{:acquire}) = 3
+order(::LLVMOrdering{:release}) = 4
+order(::LLVMOrdering{:acq_rel}) = 5
+order(::LLVMOrdering{:seq_cst}) = 6
+
+Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b))
+
+function stronger_order(a::LLVMOrdering, b::LLVMOrdering)
+    m = max(a, b)
+    if m != release
+        return m
+    end
+    # maximum is release, what is the other one?
+    other = min(a, b)
+    if other == monotonic
+        return release
+    # elseif other == Consume()
+    #     return Acq_Rel()
+    elseif other == acquire
+        return acq_rel
+    elseif other == release
+        return release
+    end
+    @assert(false)
+end
+
 for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
                                         (BlockScope, DeviceScope, SystemScope))
     asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
@@ -506,6 +534,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     end
 end
 
+# TODO: Volatile cas for 16/8
 for scope in (BlockScope, DeviceScope, SystemScope)
     asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
     asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
@@ -532,7 +561,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur
             atomic_thread_fence(seq_cst, scope)
         end
         if order == seq_cst # order == consume
-            order = Acquire()
+            order = acquire
         end
         old = __cas!(ptr, expected, new, order, scope)
     else
@@ -584,7 +613,7 @@ end
     expected = atomic_load(ptr, order)
     while !success
         new = op(expected, x)
-        expected, success = atomic_cas!(ptr, expected, new, order, relaxed)
+        expected, success = atomic_cas!(ptr, expected, new, order, monotonic)
     end
     return expected => new
 end

From e22f8f6b2752a594b5dc080cb12a1826e4d37ed3 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 21:20:05 -0500
Subject: [PATCH 10/32] fix atomic usage in linalg.jl

---
 lib/cublas/linalg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cublas/linalg.jl b/lib/cublas/linalg.jl
index ead34d4262..5779bc7c22 100644
--- a/lib/cublas/linalg.jl
+++ b/lib/cublas/linalg.jl
@@ -44,7 +44,7 @@ function LinearAlgebra.dot(x::AnyCuArray{T1}, y::AnyCuArray{T2}) where {T1,T2}
         val = CUDA.reduce_block(+, local_val, zero(T), shuffle)
         if threadIdx().x == 1i32
             # NOTE: introduces nondeterminism
-            @inbounds CUDA.@atomic res[] += val
+            @inbounds CUDA.@atomic res[1i32] += val
         end
 
         return

From ecef6921c7bcf58b913d20bb464506797b13fb13 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 21:26:48 -0500
Subject: [PATCH 11/32] fixup! fix atomic usage in linalg.jl

---
 src/device/intrinsics/atomics.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c811c31581..13304bfc25 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -371,9 +371,9 @@ for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:m
     asm_b64 = "ld.$(asm(order)).$(asm(scope)).b64 \$0, [\$1];"
     asm_b32 = "ld.$(asm(order)).$(asm(scope)).b32 \$0, [\$1];"
     @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T}}, ptr)
+        @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
     @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T}}, ptr)
+        @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
 end
 
 @inline function __load(ptr::LLVMPtr{T}, order, scope) where T
@@ -600,8 +600,8 @@ end
     atomic_store!(Atomix.pointer(ref), v, order)
 end
 
-@inline function Atomix.replace!(ref::CuIndexableRef,expected,desired,
-                                 success_ordering,failure_ordering)
+@inline function Atomix.replace!(ref::CuIndexableRef, expected, desired,
+                                 success_ordering, failure_ordering)
     ptr = Atomix.pointer(ref)
     expected = convert(eltype(ref), expected)
     desired = convert(eltype(ref), desired)

From 7ae6ccaa2ff6fb5e6b50f67b2847a98c1fc09024 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 21:41:05 -0500
Subject: [PATCH 12/32] add error for SM_60

---
 src/device/intrinsics/atomics.jl | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 13304bfc25..ae4dd6181a 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -417,7 +417,7 @@ end
         end
         val = __load(ptr, acquire, scope)
         return val
-    else
+    elseif compute_capability() >= sv"6.0"
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end
@@ -427,6 +427,8 @@ end
         end
         atomic_thread_fence(order, scope)
         return val
+    else
+        error("Atomics are only supported on SM_60")
     end
 end
 
@@ -478,11 +480,13 @@ end
             atomic_thread_fence(seq_cst, scope)
         end
         __store!(ptr, val, monotonic, scope)
-    else
+    elseif compute_capability() >= sv"6.0"
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end
         __store_volatile!(ptr, val)
+    else
+        error("Atomics are only supported on SM_60")
     end
 end
 
@@ -556,7 +560,7 @@ end
 
 function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
-    if compute_capability() >= sv"7.0"
+    if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end
@@ -564,7 +568,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur
             order = acquire
         end
         old = __cas!(ptr, expected, new, order, scope)
-    else
+    elseif compute_capability() >= sv"6.0"
         if order == seq_cst || order == acq_rel || order == release
             atomic_thread_fence(seq_cst, scope)
         end
@@ -572,6 +576,8 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur
         if order == seq_cst || order == acq_rel || order == acquire # order == consume
             atomic_thread_fence(seq_cst, scope)
         end
+    else
+        error("Atomics are only supported on SM_60")
     end
     success = expected == old
     return (; old, success)

From 80c6ab6049790f473b2d082a19c25586e0482a6b Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 21:46:39 -0500
Subject: [PATCH 13/32] Fixup modify

---
 src/device/intrinsics/atomics.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index ae4dd6181a..c123e324d1 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -550,9 +550,9 @@ end
 
 function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     if sizeof(T) == 4
-        __cas__volatile_32!(ptr, old, new, scope)
+        __cas_volatile_32!(ptr, old, new, scope)
     elseif sizeof(T) == 8
-        __cas__volatile_64!(ptr, old, new, scope)
+        __cas_volatile_64!(ptr, old, new, scope)
     else
         @assert(false)
     end
@@ -614,9 +614,10 @@ end
     return atomic_cas!(ptr, expected, desired, success_ordering, failure_ordering)
 end
 
-@inline function modify!(ptr, op::OP, x, order) where {OP}
+@inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP}
     success = false
     expected = atomic_load(ptr, order)
+    local new::T
     while !success
         new = op(expected, x)
         expected, success = atomic_cas!(ptr, expected, new, order, monotonic)
@@ -643,7 +644,7 @@ end
     # elseif op === max
     #     atomic_max!(ptr, x)
     # else
-        modify!(ptr, op, x, order)
+        return modify!(ptr, op, x, order)
     # end
-    return old => op(old, x)
+    # return old => op(old, x)
 end

From aa6258688eb78496af97cdb7445f45bc33e23879 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Mar 2023 22:11:57 -0500
Subject: [PATCH 14/32] skip shmem for now

---
 test/device/intrinsics/atomics.jl | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index f31fdd8007..dae88ccdd7 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -193,15 +193,15 @@ end
     end
 end
 
-@testset "shared memory" begin
-    function kernel()
-        shared = CuStaticSharedArray(Float32, 1)
-        @atomic shared[threadIdx().x] += 0f0
-        return
-    end
-
-    CUDA.@sync @cuda kernel()
-end
+# @testset "shared memory" begin
+#     function kernel()
+#         shared = CuStaticSharedArray(Float32, 1)
+#         @atomic shared[threadIdx().x] += 0f0
+#         return
+#     end
+
+#     CUDA.@sync @cuda kernel()
+# end
 
 end
 

From 6325ee9ee33f97e33a4800a6377fa3ca8a133a69 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Mar 2023 09:55:55 -0500
Subject: [PATCH 15/32] fix volatile load

---
 src/device/intrinsics.jl                 |  6 +++++
 src/device/intrinsics/atomics.jl         | 30 ++++++++++++++----------
 src/device/intrinsics/synchronization.jl | 12 +++++-----
 test/device/intrinsics/atomics.jl        | 25 ++++++++++++++++++++
 4 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl
index 30e797cbd7..1bf3887b62 100644
--- a/src/device/intrinsics.jl
+++ b/src/device/intrinsics.jl
@@ -15,6 +15,12 @@ const block_scope = BlockScope()
 import UnsafeAtomics
 using UnsafeAtomics.Internal: LLVMOrdering
 using UnsafeAtomics: unordered, monotonic, acquire, release, acq_rel, seq_cst
+
+struct AtomicUnsupported{T} <: Exception end
+struct AtomicOrderUnsupported{Ordering} <: Exception
+    order::Ordering
+end
+
 # Note CUDA C++ has also consume ordering which LLVM does not support
 # monotonic -> relaxed
 # unordered -> ??? maybe weak
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c123e324d1..ff3ea6a49d 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -382,30 +382,33 @@ end
     elseif sizeof(T) == 8
         __load_64(ptr, order, scope)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
 __supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8
 
-# Could be done using LLVM  
+# Could be done using LLVM
+# TODO: Register choice for Float32/Float64
 @inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS}
     if sizeof(T) == 1
-        @asmcall("ld.volatile.b8  \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+        val = @asmcall("ld.volatile.b8  \$0, [\$1];", "=r,l,~{memory}", true, UInt32, Tuple{LLVMPtr{T, AS}}, ptr)
+        return val % T
     elseif sizeof(T) == 2
-        @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+        val = @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, UInt16, Tuple{LLVMPtr{T, AS}}, ptr)
+        return Core.bitcast(T, val) # Float16 otherwise complaints
     elseif sizeof(T) == 4
-        @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+        @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
     elseif sizeof(T) == 8
-        @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}})
+        @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
 @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == release
-        @assert(false)
+        throw(AtomicOrderUnsupported(order))
     end
     if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == monotonic
@@ -448,7 +451,7 @@ end
     elseif sizeof(T) == 8
         __store_64!(ptr, val, order, scope)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
@@ -463,13 +466,13 @@ end
     elseif sizeof(T) == 8
         @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
 @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == acquire # || order == consume
-        @assert(false)
+        throw(AtomicOrderUnsupported(order))
     end
     if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == release
@@ -515,6 +518,7 @@ function stronger_order(a::LLVMOrdering, b::LLVMOrdering)
     elseif other == release
         return release
     end
+    Base.llvmcall("unreachable", Cvoid, Tuple{})
     @assert(false)
 end
 
@@ -534,7 +538,7 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     elseif sizeof(T) == 8
         __cas_64!(ptr, old, new, order, scope)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
@@ -554,7 +558,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     elseif sizeof(T) == 8
         __cas_volatile_64!(ptr, old, new, scope)
     else
-        @assert(false)
+        throw(AtomicUnsupported{T}())
     end
 end
 
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index 14fe2dbb49..e98baa16a2 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -95,7 +95,7 @@ function atomic_thread_fence(order, scope::BlockScope)
         elseif order == acquire || order == acq_rel || order == release # || order == consume
             threadfence_acq_rel_block()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     else
         if order == seq_cst ||
@@ -106,7 +106,7 @@ function atomic_thread_fence(order, scope::BlockScope)
 
             threadfence_block()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     end
 end
@@ -128,7 +128,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
 
             threadfence_acq_rel_device()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     else
         if order == seq_cst() ||
@@ -139,7 +139,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
 
             threadfence_device()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     end
 end
@@ -161,7 +161,7 @@ function atomic_thread_fence(order, scope::SystemScope)
 
             threadfence_acq_rel_system()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     else
         if order == seq_cst ||
@@ -172,7 +172,7 @@ function atomic_thread_fence(order, scope::SystemScope)
 
             threadfence_system()
         else
-            @assert(false)
+            throw(AtomicOrderUnsupported(order))
         end
     end
 end
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index dae88ccdd7..178489fd31 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -2,6 +2,31 @@
 using CUDA: @atomic, @atomicswap, @atomicreplace
 using BFloat16s: BFloat16
 
+@testset "atomics (low-level) with order" begin
+
+@testset "atomic_load" begin
+    if capability(device()) >= v"6.0"
+        types = [Int8, Int16, Int32, Int64, 
+                 UInt8, UInt16, UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
+        unsupported_orders = [CUDA.release, CUDA.acq_rel]
+
+        function kernel(a, order, scope)
+            CUDA.atomic_load(pointer(a), order, scope)
+            return
+        end
+
+        for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, order, scope)
+        end
+    end
+end
+end # atomics (low-level) with order
+
 @testset "atomics (low-level)" begin
 
 # tested on all natively-supported atomics

From c7c6f3341e670279857709a545a23a44ba48d7fc Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Mar 2023 18:28:18 -0500
Subject: [PATCH 16/32] add more low-level tests

---
 src/device/intrinsics/atomics.jl  |  1 -
 test/device/intrinsics/atomics.jl | 45 +++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index ff3ea6a49d..74a97eda9f 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -542,7 +542,6 @@ function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     end
 end
 
-# TODO: Volatile cas for 16/8
 for scope in (BlockScope, DeviceScope, SystemScope)
     asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
     asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index 178489fd31..655eb81de0 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -25,6 +25,51 @@ using BFloat16s: BFloat16
         end
     end
 end
+
+@testset "atomic_store!" begin
+    if capability(device()) >= v"6.0"
+        types = [Int8, Int16, Int32, Int64, 
+                 UInt8, UInt16, UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
+        unsupported_orders = [CUDA.acquire, CUDA.acq_rel]
+
+        function kernel(a, val, order, scope)
+            CUDA.atomic_store!(pointer(a), val, order, scope)
+            return
+        end
+
+        for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, one(T), order, scope)
+        end
+    end
+end
+
+@testset "atomic_cas!" begin
+    if capability(device()) >= v"6.0"
+        # TODO size(T) in (1, 2)
+        types = [Int32, Int64, 
+                 UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel]
+
+        function kernel(a, expected, desired, success_order, failure_order, scope)
+            CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope)
+            return
+        end
+
+        for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope)
+        end
+    end
+end
+
 end # atomics (low-level) with order
 
 @testset "atomics (low-level)" begin

From 6192f44e75576c8be4ea1dc2822450a95fe8ab12 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Mar 2023 19:55:00 -0500
Subject: [PATCH 17/32] cleanup and use egal

---
 src/device/intrinsics/atomics.jl           |  14 +-
 test/device/intrinsics/atomics.jl          | 573 +++++----------------
 test/device/intrinsics/lowlevel_atomics.jl | 308 +++++++++++
 3 files changed, 432 insertions(+), 463 deletions(-)
 create mode 100644 test/device/intrinsics/lowlevel_atomics.jl

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 74a97eda9f..d2e05c1f8c 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -502,7 +502,7 @@ order(::LLVMOrdering{:seq_cst}) = 6
 
 Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b))
 
-function stronger_order(a::LLVMOrdering, b::LLVMOrdering)
+@inline function stronger_order(a::LLVMOrdering, b::LLVMOrdering)
     m = max(a, b)
     if m != release
         return m
@@ -532,7 +532,7 @@ for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:a
         @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
 end
 
-function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
+@inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
     if sizeof(T) == 4
         __cas_32!(ptr, old, new, order, scope)
     elseif sizeof(T) == 8
@@ -545,13 +545,13 @@ end
 for scope in (BlockScope, DeviceScope, SystemScope)
     asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
     asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    @eval __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
+    @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
         @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
-    @eval __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
+    @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
         @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
 end
 
-function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
+@inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     if sizeof(T) == 4
         __cas_volatile_32!(ptr, old, new, scope)
     elseif sizeof(T) == 8
@@ -561,7 +561,7 @@ function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
     end
 end
 
-function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
+@inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
     if compute_capability() >= sv"7.0" && __supports_atomic(T)
         if order == seq_cst
@@ -582,7 +582,7 @@ function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failur
     else
         error("Atomics are only supported on SM_60")
     end
-    success = expected == old
+    success = expected === old # egal since otherwise NaN's won't work.
     return (; old, success)
 end
 
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
index 655eb81de0..da331c0474 100644
--- a/test/device/intrinsics/atomics.jl
+++ b/test/device/intrinsics/atomics.jl
@@ -1,480 +1,182 @@
-# TODO: unify with Base.@atomic
 using CUDA: @atomic, @atomicswap, @atomicreplace
-using BFloat16s: BFloat16
-
-@testset "atomics (low-level) with order" begin
-
-@testset "atomic_load" begin
-    if capability(device()) >= v"6.0"
-        types = [Int8, Int16, Int32, Int64, 
-                 UInt8, UInt16, UInt32, UInt64,
-                 Float64, Float32]
-        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
-        supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
-        unsupported_orders = [CUDA.release, CUDA.acq_rel]
-
-        function kernel(a, order, scope)
-            CUDA.atomic_load(pointer(a), order, scope)
-            return
-        end
-
-        for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
-            a = CuArray(T[0])
-            @cuda threads=1 kernel(a, order, scope)
-        end
-    end
-end
-
-@testset "atomic_store!" begin
-    if capability(device()) >= v"6.0"
-        types = [Int8, Int16, Int32, Int64, 
-                 UInt8, UInt16, UInt32, UInt64,
-                 Float64, Float32]
-        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
-        supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
-        unsupported_orders = [CUDA.acquire, CUDA.acq_rel]
-
-        function kernel(a, val, order, scope)
-            CUDA.atomic_store!(pointer(a), val, order, scope)
-            return
-        end
-
-        for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
-            a = CuArray(T[0])
-            @cuda threads=1 kernel(a, one(T), order, scope)
-        end
-    end
-end
-
-@testset "atomic_cas!" begin
-    if capability(device()) >= v"6.0"
-        # TODO size(T) in (1, 2)
-        types = [Int32, Int64, 
-                 UInt32, UInt64,
-                 Float64, Float32]
-        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
-        orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel]
-
-        function kernel(a, expected, desired, success_order, failure_order, scope)
-            CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope)
-            return
-        end
-
-        for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes)
-            a = CuArray(T[0])
-            @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope)
-        end
-    end
-end
-
-end # atomics (low-level) with order
-
-@testset "atomics (low-level)" begin
-
-# tested on all natively-supported atomics
-
-@testset "atomic_add" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32]
-    capability(device()) >= v"6.0" && push!(types, Float64)
-    capability(device()) >= v"7.0" && push!(types, Float16)
-
-    @testset for T in types
-        a = CuArray(T[0])
-
-        function kernel(a, b)
-            CUDA.atomic_add!(pointer(a), b)
-            return
-        end
-
-        @cuda threads=1024 kernel(a, one(T))
-        @test Array(a)[1] == 1024
-    end
-end
-
-@testset "atomic_sub" begin
-    types = [Int32, Int64, UInt32, UInt64]
-
-    @testset for T in types
-        a = CuArray(T[2048])
-
-        function kernel(a, b)
-            CUDA.atomic_sub!(pointer(a), b)
-            return
-        end
-
-        @cuda threads=1024 kernel(a, one(T))
-        @test Array(a)[1] == 1024
-    end
-end
-
-@testset "atomic_inc" begin
-    @testset for T in [Int32]
-        a = CuArray(T[0])
-
-        function kernel(a, b)
-            CUDA.atomic_inc!(pointer(a), b)
-            return
-        end
-
-        @cuda threads=768 kernel(a, T(512))
-        @test Array(a)[1] == 255
-    end
-end
-
-@testset "atomic_dec" begin
-    @testset for T in [Int32]
-        a = CuArray(T[1024])
-
-        function kernel(a, b)
-            CUDA.atomic_dec!(pointer(a), b)
-            return
-        end
-
-        @cuda threads=256 kernel(a, T(512))
-        @test Array(a)[1] == 257
-    end
-end
-
-@testset "atomic_xchg" begin
-    @testset for T in [Int32, Int64, UInt32, UInt64]
-        a = CuArray([zero(T)])
-
-        function kernel(a, b)
-            CUDA.atomic_xchg!(pointer(a), b)
-            return
-        end
-
-        @cuda threads=1024 kernel(a, one(T))
-        @test Array(a)[1] == one(T)
-    end
-end
-
-@testset "atomic_and" begin
-    @testset for T in [Int32, Int64, UInt32, UInt64]
-        a = CuArray(T[1023])
-
-        function kernel(a, T)
-            i = threadIdx().x - 1
-            k = 1
-            for i = 1:i
-                k *= 2
-            end
-            b = 1023 - k  # 1023 - 2^i
-            CUDA.atomic_and!(pointer(a), T(b))
-            return
-        end
-
-        @cuda threads=10 kernel(a, T)
-        @test Array(a)[1] == zero(T)
-    end
-end
-
-@testset "atomic_or" begin
-    @testset for T in [Int32, Int64, UInt32, UInt64]
-        a = CuArray(T[0])
-
-        function kernel(a, T)
-            i = threadIdx().x
-            b = 1  # 2^(i-1)
-            for i = 1:i
-                b *= 2
-            end
-            b /= 2
-            CUDA.atomic_or!(pointer(a), T(b))
-            return
-        end
-
-        @cuda threads=10 kernel(a, T)
-        @test Array(a)[1] == 1023
-    end
-end
-
-@testset "atomic_xor" begin
-    @testset for T in [Int32, Int64, UInt32, UInt64]
-        a = CuArray(T[1023])
-
-        function kernel(a, T)
-            i = threadIdx().x
-            b = 1  # 2^(i-1)
-            for i = 1:i
-                b *= 2
-            end
-            b /= 2
-            CUDA.atomic_xor!(pointer(a), T(b))
-            return
-        end
-
-        @cuda threads=10 kernel(a, T)
-        @test Array(a)[1] == 0
-    end
-end
-
-@testset "atomic_cas" begin
-    types = [Int32, Int64, UInt32, UInt64]
-    capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16])
-
-    @testset for T in types
-        a = CuArray(T[0])
-
-        function kernel(a, b, c)
-            CUDA.atomic_cas!(pointer(a), b, c)
-            return
-        end
-
-        @cuda threads=1024 kernel(a, zero(T), one(T))
-        @test Array(a)[1] == 1
-    end
-end
-
-@testset "atomic_max" begin
-    types = [Int32, Int64, UInt32, UInt64]
-
-    @testset for T in types
-        a = CuArray([zero(T)])
-
-        function kernel(a, T)
-            i = threadIdx().x
-            CUDA.atomic_max!(pointer(a), T(i))
-            return
-        end
-
-        @cuda threads=1024 kernel(a, T)
-        @test Array(a)[1] == 1024
-    end
-end
-
-@testset "atomic_min" begin
-    types = [Int32, Int64, UInt32, UInt64]
-
-    @testset for T in types
-        a = CuArray(T[1024])
-
-        function kernel(a, T)
-            i = threadIdx().x
-            CUDA.atomic_min!(pointer(a), T(i))
-            return
-        end
-
-        @cuda threads=1024 kernel(a, T)
-        @test Array(a)[1] == 1
-    end
-end
-
-# @testset "shared memory" begin
-#     function kernel()
-#         shared = CuStaticSharedArray(Float32, 1)
-#         @atomic shared[threadIdx().x] += 0f0
-#         return
-#     end
-
-#     CUDA.@sync @cuda kernel()
-# end
-
-end
 
 @testset "atomics (high-level)" begin
-
-# tested on all types supported by atomic_cas! (which empowers the fallback definition)
-
-@testset "add" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray([zero(T)])
-
+    # tested on all types supported by atomic_cas! (which empowers the fallback definition)
+    
+    @testset "add" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
-            @atomic a[1] = a[1] + 1
             @atomic a[1] += 1
             return
         end
 
-        @cuda threads=1024 kernel(T, a)
-        @test Array(a)[1] == 2048
+        @testset for T in types
+            a = CuArray([zero(T)])    
+            @cuda threads=1024 kernel(T, a)
+            @test Array(a)[1] == 1024
+        end
     end
-end
-
-@testset "sub" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray(T[2048])
-
+    
+    @testset "sub" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
-            @atomic a[1] = a[1] - 1
             @atomic a[1] -= 1
             return
         end
 
-        @cuda threads=1024 kernel(T, a)
-        @test Array(a)[1] == 0
+        @testset for T in types
+            a = CuArray(T[2048])    
+            @cuda threads=1024 kernel(T, a)
+            @test Array(a)[1] == 1024
+        end
     end
-end
-
-@testset "mul" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray(T[1])
-
+    
+    @testset "mul" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
-            @atomic a[1] = a[1] * 2
             @atomic a[1] *= 2
             return
         end
 
-        @cuda threads=5 kernel(T, a)
-        @test Array(a)[1] == 1024
+        @testset for T in types
+            a = CuArray(T[1])
+            @cuda threads=5 kernel(T, a)
+            @test Array(a)[1] == 32
+        end
     end
-end
-
-@testset "div" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray(T[1024])
-
+    
+    @testset "div" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
-            @atomic a[1] = a[1] / 2
             @atomic a[1] /= 2
             return
         end
 
-        @cuda threads=5 kernel(T, a)
-        @test Array(a)[1] == 1
+        @testset for T in types
+            a = CuArray(T[32])    
+            @cuda threads=5 kernel(T, a)
+            @test Array(a)[1] == 1
+        end
     end
-end
-
-@testset "and" begin
-    types = [Int32, Int64, UInt32, UInt64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
-
-    @testset for T in types
-        a = CuArray([~zero(T), ~zero(T)])
-
+    
+    @testset "and" begin
+        types = [Int32, Int64, UInt32, UInt64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
+    
         function kernel(T, a)
             i = threadIdx().x
             mask = ~(T(1) << (i-1))
-            @atomic a[1] = a[1] & mask
-            @atomic a[2] &= mask
+            @atomic a[1] &= mask
             return
         end
-
-        @cuda threads=8*sizeof(T) kernel(T, a)
-        @test Array(a)[1] == zero(T)
-        @test Array(a)[2] == zero(T)
+            
+        @testset for T in types
+            a = CuArray([~zero(T)])    
+            @cuda threads=8*sizeof(T) kernel(T, a)
+            @test Array(a)[1] == zero(T)
+        end
     end
-end
-
-@testset "or" begin
-    types = [Int32, Int64, UInt32, UInt64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
-
-    @testset for T in types
-        a = CuArray([zero(T), zero(T)])
+    
+    @testset "or" begin
+        types = [Int32, Int64, UInt32, UInt64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
 
         function kernel(T, a)
             i = threadIdx().x
             mask = T(1) << (i-1)
-            @atomic a[1] = a[1] | mask
-            @atomic a[2] |= mask
+            @atomic a[1] |= mask
             return
         end
 
-        @cuda threads=8*sizeof(T) kernel(T, a)
-        @test Array(a)[1] == ~zero(T)
-        @test Array(a)[2] == ~zero(T)
+        @testset for T in types
+            a = CuArray([zero(T)])    
+            @cuda threads=8*sizeof(T) kernel(T, a)
+            @test Array(a)[1] == ~zero(T)
+        end
     end
-end
-
-@testset "xor" begin
-    types = [Int32, Int64, UInt32, UInt64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
-
-    @testset for T in types
-        a = CuArray([zero(T), zero(T)])
-
+    
+    @testset "xor" begin
+        types = [Int32, Int64, UInt32, UInt64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
+    
         function kernel(T, a)
             i = threadIdx().x
             mask = T(1) << ((i-1)%(8*sizeof(T)))
-            @atomic a[1] = a[1] ⊻ mask
-            @atomic a[2] ⊻= mask
+            @atomic a[1] ⊻= mask
             return
         end
 
-        nb = 4
-        @cuda threads=(8*sizeof(T)+nb) kernel(T, a)
-        @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T))
-        @test Array(a)[2] == ~zero(T) & ~((one(T) << nb) - one(T))
+        @testset for T in types
+            a = CuArray([zero(T)])
+            nb = 4
+            @cuda threads=(8*sizeof(T)+nb) kernel(T, a)
+            @test Array(a)[1] == ~zero(T) & ~((one(T) << nb) - one(T))
+        end
     end
-end
-
-@testset "max" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray([zero(T)])
-
+    
+    @testset "max" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
             i = threadIdx().x
-            @atomic a[1] = max(a[1], i)
+            @atomic a[1] max i
             return
         end
 
-        @cuda threads=32 kernel(T, a)
-        @test Array(a)[1] == 32
+        @testset for T in types
+            a = CuArray([zero(T)])    
+            @cuda threads=32 kernel(T, a)
+            @test Array(a)[1] == 32
+        end
     end
-end
-
-@testset "min" begin
-    types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
-
-    @testset for T in types
-        a = CuArray([typemax(T)])
-
+    
+    @testset "min" begin
+        types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    
         function kernel(T, a)
             i = threadIdx().x
-            @atomic a[1] = min(a[1], i)
+            @atomic a[1] min i
             return
         end
 
-        @cuda threads=32 kernel(T, a)
-        @test Array(a)[1] == 1
+        @testset for T in types
+            a = CuArray([typemax(T)])
+            @cuda threads=32 kernel(T, a)
+            @test Array(a)[1] == 1
+        end
     end
-end
-
-@testset "shift" begin
-    types = [Int32, Int64, UInt32, UInt64]
-    capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
-
-    @testset for T in types
-        a = CuArray([one(T)])
-
+    
+    @testset "shift" begin
+        types = [Int32, Int64, UInt32, UInt64]
+        # capability(device()) >= v"7.0" && append!(types, [Int16, UInt16])
+    
         function kernel(T, a)
             @atomic a[1] <<= 1
             return
         end
 
-        @cuda threads=8 kernel(T, a)
-        @test Array(a)[1] == 1<<8
+        @testset for T in types
+            a = CuArray([one(T)])    
+            @cuda threads=8 kernel(T, a)
+            @test Array(a)[1] == 1<<8
+        end
     end
-end
-
-@testset "macro" begin
-
+    
     @testset "NaN" begin
         f(x,y) = 3x + 2y
 
         function kernel(x)
-            CUDA.@atomic x[1] = f(x[1],42f0)
+            @inbounds CUDA.@atomic x[1] f 42f0
             nothing
         end
 
@@ -487,56 +189,15 @@ end
         @test isnan(Array(a)[1])
     end
 
-    @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin
-        @atomic wat(a[1])
-    end
-
-    @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin
-        @atomic a = a + 1
-    end
-end
-
-@testset "shared memory" begin
-    # test that atomic operations on shared memory work
-    # https://github.com/JuliaGPU/CUDA.jl/issues/311
-
-    function kernel(a)
-        b = CUDA.CuStaticSharedArray(Int, 1)
-
-        if threadIdx().x == 1
-            b[] = a[]
+    @testset "macro" begin
+        @test_throws_macro ErrorException("could not parse @atomic expression wat(a[1])") @macroexpand begin
+            @atomic wat(a[1])
         end
-        sync_threads()
-
-        CUDA.atomic_add!(pointer(b), 1)
-        sync_threads()
-
-        if threadIdx().x == 1
-            a[] = b[]
+    
+        @test_throws_macro ErrorException("@atomic modify expression missing field access") @macroexpand begin
+            @atomic a = a + 1
         end
-        return
     end
-
-    a = CuArray([0])
-    @cuda threads=16 kernel(a)
-    @test Array(a) == [16]
-end
-
-@testset "shared memory bug" begin
-    # shared memory atomics resulted in illegal memory accesses
-    # https://github.com/JuliaGPU/CUDA.jl/issues/558
-
-    function kernel()
-        tid = threadIdx().x
-        shared = CuStaticSharedArray(Float32, 4)
-        CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
-        sync_threads()
-        CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
-        return
-    end
-
-    @cuda threads=2 kernel()
-    synchronize()
-end
-
+    
 end
+    
\ No newline at end of file
diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl
new file mode 100644
index 0000000000..7122cd984f
--- /dev/null
+++ b/test/device/intrinsics/lowlevel_atomics.jl
@@ -0,0 +1,308 @@
+using BFloat16s: BFloat16
+
+@testset "atomics (low-level) with order" begin
+
+@testset "atomic_load" begin
+    if capability(device()) >= v"6.0"
+        types = [Int8, Int16, Int32, Int64, 
+                 UInt8, UInt16, UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
+        unsupported_orders = [CUDA.release, CUDA.acq_rel]
+
+        function kernel(a, order, scope)
+            CUDA.atomic_load(pointer(a), order, scope)
+            return
+        end
+
+        @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, order, scope)
+            @test Array(a)[1] == 0
+        end
+    end
+end
+
+@testset "atomic_store!" begin
+    if capability(device()) >= v"6.0"
+        types = [Int8, Int16, Int32, Int64, 
+                 UInt8, UInt16, UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
+        unsupported_orders = [CUDA.acquire, CUDA.acq_rel]
+
+        function kernel(a, val, order, scope)
+            CUDA.atomic_store!(pointer(a), val, order, scope)
+            return
+        end
+
+        @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, one(T), order, scope)
+            @test Array(a)[1] == one(T)
+        end
+    end
+end
+
+@testset "atomic_cas!" begin
+    if capability(device()) >= v"6.0"
+        # TODO size(T) in (1, 2)
+        types = [Int32, Int64, 
+                 UInt32, UInt64,
+                 Float64, Float32]
+        scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
+        # TODO unordered
+        orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel]
+
+        function kernel(a, expected, desired, success_order, failure_order, scope)
+            CUDA.atomic_cas!(pointer(a), expected, desired, success_order, failure_order, scope)
+            return
+        end
+
+        @testset for (T, success_order, failure_order, scope) in Iterators.product(types, orders, orders, scopes)
+            a = CuArray(T[0])
+            @cuda threads=1 kernel(a, zero(T), one(T), success_order, failure_order, scope)
+            @test Array(a)[1] == one(T)
+        end
+    end
+end
+
+end # atomics (low-level) with order
+
+@testset "atomics (low-level)" begin
+
+# tested on all natively-supported atomics
+
+@testset "atomic_add" begin
+    types = [Int32, Int64, UInt32, UInt64, Float32]
+    capability(device()) >= v"6.0" && push!(types, Float64)
+    capability(device()) >= v"7.0" && push!(types, Float16)
+
+    function kernel(a, b)
+        CUDA.atomic_add!(pointer(a), b)
+        return
+    end
+
+    @testset for T in types
+        a = CuArray(T[0])
+
+        @cuda threads=1024 kernel(a, one(T))
+        @test Array(a)[1] == 1024
+    end
+end
+
+@testset "atomic_sub" begin
+    types = [Int32, Int64, UInt32, UInt64]
+
+    function kernel(a, b)
+        CUDA.atomic_sub!(pointer(a), b)
+        return
+    end
+
+    @testset for T in types
+        a = CuArray(T[2048])
+        @cuda threads=1024 kernel(a, one(T))
+        @test Array(a)[1] == 1024
+    end
+end
+
+@testset "atomic_inc" begin
+    function kernel(a, b)
+        CUDA.atomic_inc!(pointer(a), b)
+        return
+    end
+
+    @testset for T in [Int32]
+        a = CuArray(T[0])
+        @cuda threads=768 kernel(a, T(512))
+        @test Array(a)[1] == 255
+    end
+end
+
+@testset "atomic_dec" begin
+    function kernel(a, b)
+        CUDA.atomic_dec!(pointer(a), b)
+        return
+    end
+
+    @testset for T in [Int32]
+        a = CuArray(T[1024])
+        @cuda threads=256 kernel(a, T(512))
+        @test Array(a)[1] == 257
+    end
+end
+
+@testset "atomic_xchg" begin
+    function kernel(a, b)
+        CUDA.atomic_xchg!(pointer(a), b)
+        return
+    end
+    @testset for T in [Int32, Int64, UInt32, UInt64]
+        a = CuArray([zero(T)])
+        @cuda threads=1024 kernel(a, one(T))
+        @test Array(a)[1] == one(T)
+    end
+end
+
+@testset "atomic_and" begin
+    function kernel(a, T)
+        i = threadIdx().x - 1
+        k = 1
+        for i = 1:i
+            k *= 2
+        end
+        b = 1023 - k  # 1023 - 2^i
+        CUDA.atomic_and!(pointer(a), T(b))
+        return
+    end
+    @testset for T in [Int32, Int64, UInt32, UInt64]
+        a = CuArray(T[1023])
+        @cuda threads=10 kernel(a, T)
+        @test Array(a)[1] == zero(T)
+    end
+end
+
+@testset "atomic_or" begin
+    function kernel(a, T)
+        i = threadIdx().x
+        b = 1  # 2^(i-1)
+        for i = 1:i
+            b *= 2
+        end
+        b /= 2
+        CUDA.atomic_or!(pointer(a), T(b))
+        return
+    end
+    @testset for T in [Int32, Int64, UInt32, UInt64]
+        a = CuArray(T[0])
+        @cuda threads=10 kernel(a, T)
+        @test Array(a)[1] == 1023
+    end
+end
+
+@testset "atomic_xor" begin
+    function kernel(a, T)
+        i = threadIdx().x
+        b = 1  # 2^(i-1)
+        for i = 1:i
+            b *= 2
+        end
+        b /= 2
+        CUDA.atomic_xor!(pointer(a), T(b))
+        return
+    end
+    @testset for T in [Int32, Int64, UInt32, UInt64]
+        a = CuArray(T[1023])
+        @cuda threads=10 kernel(a, T)
+        @test Array(a)[1] == 0
+    end
+end
+
+@testset "atomic_cas" begin
+    types = [Int32, Int64, UInt32, UInt64]
+    capability(device()) >= v"7.0" && append!(types, [UInt16, BFloat16])
+
+    function kernel(a, b, c)
+        CUDA.atomic_cas!(pointer(a), b, c)
+        return
+    end
+
+    @testset for T in types
+        a = CuArray(T[0])
+        @cuda threads=1024 kernel(a, zero(T), one(T))
+        @test Array(a)[1] == 1
+    end
+end
+
+@testset "atomic_max" begin
+    types = [Int32, Int64, UInt32, UInt64]
+
+    function kernel(a, T)
+        i = threadIdx().x
+        CUDA.atomic_max!(pointer(a), T(i))
+        return
+    end
+
+    @testset for T in types
+        a = CuArray([zero(T)])
+        @cuda threads=1024 kernel(a, T)
+        @test Array(a)[1] == 1024
+    end
+end
+
+@testset "atomic_min" begin
+    types = [Int32, Int64, UInt32, UInt64]
+
+    function kernel(a, T)
+        i = threadIdx().x
+        CUDA.atomic_min!(pointer(a), T(i))
+        return
+    end
+
+    @testset for T in types
+        a = CuArray(T[1024])
+        @cuda threads=1024 kernel(a, T)
+        @test Array(a)[1] == 1
+    end
+end
+
+@testset "shared memory" begin
+    @testset "simple" begin
+        function kernel()
+            shared = CuStaticSharedArray(Float32, 1)
+            CUDA.atomic_add!(pointer(shared, threadIdx().x), 0f0)
+            return
+        end
+
+        CUDA.@sync @cuda kernel()
+    end
+
+    @testset "shared memory reduction" begin
+        # test that atomic operations on shared memory work
+        # https://github.com/JuliaGPU/CUDA.jl/issues/311
+
+        function kernel(a)
+            b = CUDA.CuStaticSharedArray(Int, 1)
+
+            if threadIdx().x == 1
+                b[] = a[]
+            end
+            sync_threads()
+
+            CUDA.atomic_add!(pointer(b), 1)
+            sync_threads()
+
+            if threadIdx().x == 1
+                a[] = b[]
+            end
+            return
+        end
+
+        a = CuArray([0])
+        @cuda threads=16 kernel(a)
+        @test Array(a) == [16]
+    end
+
+    @testset "shared memory bug" begin
+        # shared memory atomics resulted in illegal memory accesses
+        # https://github.com/JuliaGPU/CUDA.jl/issues/558
+
+        function kernel()
+            tid = threadIdx().x
+            shared = CuStaticSharedArray(Float32, 4)
+            CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
+            sync_threads()
+            CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
+            return
+        end
+
+        @cuda threads=2 kernel()
+        synchronize()
+    end
+end
+
+end # low-level atomics

From 7437924ecf478aa03277547a8bc7b3c0736a7df8 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Mar 2023 21:42:22 -0500
Subject: [PATCH 18/32] add fallback for < sm_60

---
 src/device/intrinsics/atomics.jl | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index d2e05c1f8c..423c7af317 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -431,7 +431,16 @@ end
         atomic_thread_fence(order, scope)
         return val
     else
-        error("Atomics are only supported on SM_60")
+        # Fallback to threadfence w/o order + load_volatile
+        if order == seq_cst
+            threadfence(scope)
+        end
+        val = __load_volatile(ptr)
+        if order == monotonic
+            return val
+        end
+        threadfence(scope)
+        return val
     end
 end
 
@@ -489,7 +498,11 @@ end
         end
         __store_volatile!(ptr, val)
     else
-        error("Atomics are only supported on SM_60")
+        # Fallback to threadfence w/o order + store_volatile
+        if order == seq_cst
+            threadfence(scope)
+        end
+        __store_volatile!(ptr, val)
     end
 end
 
@@ -580,7 +593,15 @@ end
             atomic_thread_fence(seq_cst, scope)
         end
     else
-        error("Atomics are only supported on SM_60")
+        # Fallback to atomic_cas w/o scope on pre SM_60
+        if order == seq_cst || order == acq_rel || order == release
+            threadfence(scope)
+        end
+        val = atomic_cas!(ptr, expected, new)
+        if order == seq_cst || order == acq_rel || order == acquire # order == consume
+            threadfence(scope)
+        end
+        return val
     end
     success = expected === old # egal since otherwise NaN's won't work.
     return (; old, success)

From 5d63f5f1d883f5898cb685f16c9a04bcf1445961 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Mar 2023 22:05:24 -0500
Subject: [PATCH 19/32] add __cas_volatile_16 and global/shared

---
 src/device/intrinsics/atomics.jl | 54 ++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 423c7af317..55da7ae0c5 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -535,14 +535,22 @@ Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b))
     @assert(false)
 end
 
-for (order, scope) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
-                                        (BlockScope, DeviceScope, SystemScope))
-    asm_b64 = "atom.cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
-    asm_b32 = "atom.cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    @eval @inline __cas_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
-    @eval @inline __cas_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+for (order, scope, A) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                                           (BlockScope, DeviceScope, SystemScope),
+                                           (AS.Generic, AS.Global, AS.Shared))
+    if A == AS.Global
+        as = ".global"
+    elseif A == AS.Shared
+        as = ".shared"
+    else
+        as = ""
+    end
+    asm_b64 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
+    asm_b32 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
+    @eval @inline __cas_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
+        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+    @eval @inline __cas_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
+        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
 @inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
@@ -555,17 +563,31 @@ end
     end
 end
 
-for scope in (BlockScope, DeviceScope, SystemScope)
-    asm_b64 = "atom.cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
-    asm_b32 = "atom.cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
-        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
-    @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, AS}, old::T, new::T, ::$scope) where {T, AS} =
-        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, AS}, T, T}, ptr, old, new)
+for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope),
+                                     (AS.Generic, AS.Global, AS.Shared))
+    if A == AS.Global
+        as = ".global"
+    elseif A == AS.Shared
+        as = ".shared"
+    else
+        as = ""
+    end
+
+    asm_b64 = "atom$(as).cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
+    asm_b32 = "atom$(as).cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
+    asm_b16 = "atom$(as).cas.$(asm(scope)).b16 \$0,[\$1],\$2,\$3;"
+    @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+    @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+    @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+        @asmcall($asm_b32, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
 @inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
-    if sizeof(T) == 4
+    if sizeof(T) == 2
+        __cas_volatile_16!(ptr, old, new, scope)
+    elseif sizeof(T) == 4
         __cas_volatile_32!(ptr, old, new, scope)
     elseif sizeof(T) == 8
         __cas_volatile_64!(ptr, old, new, scope)

From 450808bb4f16edaa8e847f6ee3263dd605a7fb3d Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 12 Mar 2023 01:29:33 -0500
Subject: [PATCH 20/32] no I am not losing my mind

---
 src/device/intrinsics/atomics.jl | 166 +++++++++++++++++++------------
 1 file changed, 101 insertions(+), 65 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 55da7ae0c5..57627efef4 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -366,51 +366,94 @@ asm(::Type{SystemScope}) = :sys
 asm(::Type{DeviceScope}) = :gpu
 asm(::Type{BlockScope}) = :cta
 
-for (order, scope) in Iterators.product((LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}),
-                                        (BlockScope, DeviceScope, SystemScope))
-    asm_b64 = "ld.$(asm(order)).$(asm(scope)).b64 \$0, [\$1];"
-    asm_b32 = "ld.$(asm(order)).$(asm(scope)).b32 \$0, [\$1];"
-    @eval @inline __load_64(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b64, "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
-    @eval @inline __load_32(ptr::LLVMPtr{T, AS}, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b32, "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
+function suffix(sz)
+    if sz == 1
+        "b8"
+    elseif sz == 2
+        "b16"
+    elseif sz == 4
+        "b32"
+    elseif sz == 8
+        "b64"
+    end
 end
 
-@inline function __load(ptr::LLVMPtr{T}, order, scope) where T
-    if sizeof(T) == 4
-        __load_32(ptr, order, scope)
-    elseif sizeof(T) == 8
-        __load_64(ptr, order, scope)
+function reg(sz)
+    if sz == 1
+        "r"
+    elseif sz == 2
+        "h"
+    elseif sz == 4
+        "r"
+    elseif sz == 8
+        "l"
+    end
+end
+
+function addr_space(A)
+    if A == AS.Global
+        as = ".global"
+    elseif A == AS.Shared
+        as = ".shared"
     else
-        throw(AtomicUnsupported{T}())
+        as = ""
     end
 end
 
-__supports_atomic(::Type{T}) where T = sizeof(T) == 4 || sizeof(T) == 8
+for (order, scope, A, sz) in Iterators.product(
+                                (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}),
+                                (BlockScope, DeviceScope, SystemScope),
+                                (AS.Generic, AS.Global, AS.Shared),
+                                (2,4,8))
+    instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1];"
+    constraint  = "=$(reg(sz)),l,~{memory}"
+    @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} =
+        @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr)
+end
 
-# Could be done using LLVM
-# TODO: Register choice for Float32/Float64
-@inline function __load_volatile(ptr::LLVMPtr{T, AS}) where {T, AS}
-    if sizeof(T) == 1
-        val = @asmcall("ld.volatile.b8  \$0, [\$1];", "=r,l,~{memory}", true, UInt32, Tuple{LLVMPtr{T, AS}}, ptr)
-        return val % T
-    elseif sizeof(T) == 2
-        val = @asmcall("ld.volatile.b16 \$0, [\$1];", "=h,l,~{memory}", true, UInt16, Tuple{LLVMPtr{T, AS}}, ptr)
-        return Core.bitcast(T, val) # Float16 otherwise complaints
-    elseif sizeof(T) == 4
-        @asmcall("ld.volatile.b32 \$0, [\$1];", "=r,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
-    elseif sizeof(T) == 8
-        @asmcall("ld.volatile.b64 \$0, [\$1];", "=l,l,~{memory}", true, T, Tuple{LLVMPtr{T, AS}}, ptr)
-    else
-        throw(AtomicUnsupported{T}())
+# Handle byte sized load
+for (order, scope, A) in Iterators.product(
+                            (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}),
+                            (BlockScope, DeviceScope, SystemScope),
+                            (AS.Generic, AS.Global, AS.Shared))
+    instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];"
+    constraint  = "=r,l,~{memory}"
+    @eval function @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T}
+        val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr)
+        return Core.bitcast(T, val % UInt8)
+    end
+end
+
+@inline __load(ptr::LLVMPtr{T}, order, scope) where T =
+    __load(Val(sizeof(T)), ptr, order, scope)
+
+for (A, sz) in Iterators.product(
+                    (AS.Generic, AS.Global, AS.Shared),
+                    (2,4,8))
+    instruction = "ld$(addr_space(A)).volatile.$(suffix(sz)) \$0, [\$1];"
+    constraint  = "=$(reg(sz)),l,~{memory}"
+    @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} =
+        @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr)
+end
+
+# Handle byte sized load
+for (A) in (AS.Generic, AS.Global, AS.Shared)
+    instruction = "ld$(addr_space(A)).volatile.b8 \$0, [\$1];"
+    constraint  = "=r,l,~{memory}"
+    @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T}
+        val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr)
+        return Core.bitcast(T, val % UInt8)
     end
 end
 
+@inline __load_volatile(ptr::LLVMPtr{T}) where {T} =
+    __load_volatile(Val(sizeof(T)), ptr)
+
 @inline function atomic_load(ptr::LLVMPtr{T}, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == release
         throw(AtomicOrderUnsupported(order))
     end
-    if compute_capability() >= sv"7.0" && __supports_atomic(T)
+    if compute_capability() >= sv"7.0"
         if order == monotonic
             val = __load(ptr, monotonic, scope)
             return val
@@ -444,46 +487,40 @@ end
     end
 end
 
-for (order, scope) in Iterators.product((LLVMOrdering{:release}, LLVMOrdering{:monotonic}),
-                                        (BlockScope, DeviceScope, SystemScope))
-    asm_b64 = "st.$(asm(order)).$(asm(scope)).b64 [\$0], \$1;"
-    asm_b32 = "st.$(asm(order)).$(asm(scope)).b32 [\$0], \$1;"
-    @eval @inline __store_64!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b64, "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-    @eval @inline __store_32!(ptr::LLVMPtr{T, AS}, val::T, ::$order, ::$scope) where {T, AS} =
-        @asmcall($asm_b32, "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
+for (order, scope, A, sz) in Iterators.product(
+                            (LLVMOrdering{:release}, LLVMOrdering{:monotonic}),
+                            (BlockScope, DeviceScope, SystemScope),
+                            (AS.Generic, AS.Global, AS.Shared),
+                            (1, 2, 4, 8))
+    instruction = "st$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) [\$0], \$1;"
+    constraint  = "l,$(reg(sz)),~{memory}"
+    @eval @inline __store!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T, ::$order, ::$scope) where {T} =
+        @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val)
 end
 
-@inline function __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T
-    if sizeof(T) == 4
-        __store_32!(ptr, val, order, scope)
-    elseif sizeof(T) == 8
-        __store_64!(ptr, val, order, scope)
-    else
-        throw(AtomicUnsupported{T}())
-    end
+@inline __store!(ptr::LLVMPtr{T}, val::T, order, scope) where T =
+    __store!(Val(sizeof(T)), ptr, val, order, scope)
+
+for (A, sz) in Iterators.product(
+                (LLVMOrdering{:release}, LLVMOrdering{:monotonic}),
+                (BlockScope, DeviceScope, SystemScope),
+                (AS.Generic, AS.Global, AS.Shared),
+                (1, 2, 4, 8))
+    instruction = "st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;"
+    constraint  = "l,$(reg(sz)),~{memory}"
+    @eval @inline __store_volatile!(::Val{$sz}, ptr::LLVMPtr{T, $A}, val::T) where {T} =
+        @asmcall($instruction, $constraint, true, Cvoid, Tuple{LLVMPtr{T, $A}, T}, ptr, val)
 end
 
 # Could be done using LLVM.
-@inline function __store_volatile!(ptr::LLVMPtr{T, AS}, val::T) where {T, AS}
-    if sizeof(T) == 1
-        @asmcall("st.volatile.b8 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-    elseif sizeof(T) == 2
-        @asmcall("st.volatile.b16 [\$0], \$1;", "l,h,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-    elseif sizeof(T) == 4
-        @asmcall("st.volatile.b32 [\$0], \$1;", "l,r,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-    elseif sizeof(T) == 8
-        @asmcall("st.volatile.b64 [\$0], \$1;", "l,l,~{memory}", true, Cvoid, Tuple{LLVMPtr{T, AS}, T}, ptr, val)
-    else
-        throw(AtomicUnsupported{T}())
-    end
-end
+@inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} =
+    __store_volatile!(Val(sizeof(T)), ptr, val)  # dispatch on byte size; the Val{N} methods are named with `!`
 
 @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == acquire # || order == consume
         throw(AtomicOrderUnsupported(order))
     end
-    if compute_capability() >= sv"7.0" && __supports_atomic(T)
+    if compute_capability() >= sv"7.0"
         if order == release
             __store!(ptr, val, release, scope)
             return
@@ -581,7 +618,7 @@ for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope),
     @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
         @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
     @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
-        @asmcall($asm_b32, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+        @asmcall($asm_b16, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
 @inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
@@ -598,7 +635,7 @@ end
 
 @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
-    if compute_capability() >= sv"7.0" && __supports_atomic(T)
+    if compute_capability() >= sv"7.0" && 2 <= sizeof(T) <= 4
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end
@@ -619,11 +656,10 @@ end
         if order == seq_cst || order == acq_rel || order == release
             threadfence(scope)
         end
-        val = atomic_cas!(ptr, expected, new)
+        old = atomic_cas!(ptr, expected, new)
         if order == seq_cst || order == acq_rel || order == acquire # order == consume
             threadfence(scope)
         end
-        return val
     end
     success = expected === old # egal since otherwise NaN's won't work.
     return (; old, success)

From 353e0367b7211f276b252d1b8bcb6b8cdb9a3b89 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 12 Mar 2023 10:29:05 -0400
Subject: [PATCH 21/32] fixup! no I am not losing my mind

---
 src/device/intrinsics/atomics.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 57627efef4..b6703d87a7 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -418,7 +418,7 @@ for (order, scope, A) in Iterators.product(
                             (AS.Generic, AS.Global, AS.Shared))
     instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];"
     constraint  = "=r,l,~{memory}"
-    @eval function @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T}
+    @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T}
         val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr)
         return Core.bitcast(T, val % UInt8)
     end
@@ -502,8 +502,6 @@ end
     __store!(Val(sizeof(T)), ptr, val, order, scope)
 
 for (A, sz) in Iterators.product(
-                (LLVMOrdering{:release}, LLVMOrdering{:monotonic}),
-                (BlockScope, DeviceScope, SystemScope),
                 (AS.Generic, AS.Global, AS.Shared),
                 (1, 2, 4, 8))
     instruction = "st$(addr_space(A)).volatile.$(suffix(sz)) [\$0], \$1;"

From 334fad2d8b781067b58929ca2f04d34f071be15a Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 12 Mar 2023 10:45:56 -0400
Subject: [PATCH 22/32] cleanup cas

---
 src/device/intrinsics/atomics.jl | 94 ++++++++++----------------------
 1 file changed, 30 insertions(+), 64 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index b6703d87a7..4d6e5ab65e 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -570,66 +570,32 @@ Base.isless(a::LLVMOrdering, b::LLVMOrdering) = isless(order(a), order(b))
     @assert(false)
 end
 
-for (order, scope, A) in Iterators.product((LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
-                                           (BlockScope, DeviceScope, SystemScope),
-                                           (AS.Generic, AS.Global, AS.Shared))
-    if A == AS.Global
-        as = ".global"
-    elseif A == AS.Shared
-        as = ".shared"
-    else
-        as = ""
-    end
-    asm_b64 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
-    asm_b32 = "atom$(as).cas.$(asm(order)).$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    @eval @inline __cas_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
-        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
-    @eval @inline __cas_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
-        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+for (order, scope, A, sz) in Iterators.product(
+                                (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                                (BlockScope, DeviceScope, SystemScope),
+                                (AS.Generic, AS.Global, AS.Shared),
+                                (4, 8))
+    instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;"
+    constraint  = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}"
+    @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
+        @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
-@inline function __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T
-    if sizeof(T) == 4
-        __cas_32!(ptr, old, new, order, scope)
-    elseif sizeof(T) == 8
-        __cas_64!(ptr, old, new, order, scope)
-    else
-        throw(AtomicUnsupported{T}())
-    end
-end
+@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T =
+    __cas(sizeof(T), ptr, old, new, order, scope)
 
-for (scope, A) in Iterators.product((BlockScope, DeviceScope, SystemScope),
-                                     (AS.Generic, AS.Global, AS.Shared))
-    if A == AS.Global
-        as = ".global"
-    elseif A == AS.Shared
-        as = ".shared"
-    else
-        as = ""
-    end
-
-    asm_b64 = "atom$(as).cas.$(asm(scope)).b64 \$0,[\$1],\$2,\$3;"
-    asm_b32 = "atom$(as).cas.$(asm(scope)).b32 \$0,[\$1],\$2,\$3;"
-    asm_b16 = "atom$(as).cas.$(asm(scope)).b16 \$0,[\$1],\$2,\$3;"
-    @eval @inline __cas_volatile_64!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
-        @asmcall($asm_b64, "=l,l,l,l,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
-    @eval @inline __cas_volatile_32!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
-        @asmcall($asm_b32, "=r,l,r,r,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
-    @eval @inline __cas_volatile_16!(ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
-        @asmcall($asm_b16, "=h,l,h,h,~{memory}", true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+for (scope, A, sz) in Iterators.product(
+                                (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                                (AS.Generic, AS.Global, AS.Shared),
+                                (2, 4, 8))
+    instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;"
+    constraint  = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}"
+    @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+        @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
-@inline function __cas_volatile!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T
-    if sizeof(T) == 2
-        __cas_volatile_16!(ptr, old, new, scope)
-    elseif sizeof(T) == 4
-        __cas_volatile_32!(ptr, old, new, scope)
-    elseif sizeof(T) == 8
-        __cas_volatile_64!(ptr, old, new, scope)
-    else
-        throw(AtomicUnsupported{T}())
-    end
-end
+@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T =
+    __cas!(Val(sizeof(T)), ptr, old, new, scope)
 
 @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
@@ -645,7 +611,7 @@ end
         if order == seq_cst || order == acq_rel || order == release
             atomic_thread_fence(seq_cst, scope)
         end
-        old = __cas_volatile!(ptr, expected, new, scope)
+        old = __cas!(ptr, expected, new, scope)
         if order == seq_cst || order == acq_rel || order == acquire # order == consume
             atomic_thread_fence(seq_cst, scope)
         end
@@ -659,8 +625,7 @@ end
             threadfence(scope)
         end
     end
-    success = expected === old # egal since otherwise NaN's won't work.
-    return (; old, success)
+    return old
 end
 
 #
@@ -695,14 +660,15 @@ end
 end
 
 @inline function modify!(ptr::LLVMPtr{T}, op::OP, x, order) where {T, OP}
-    success = false
-    expected = atomic_load(ptr, order)
-    local new::T
-    while !success
+    old = atomic_load(ptr, order)
+    while true
+        expected = old
         new = op(expected, x)
-        expected, success = atomic_cas!(ptr, expected, new, order, monotonic)
+        old = atomic_cas!(ptr, expected, new, order, monotonic)
+        if old === expected
+            return expected => new
+        end
     end
-    return expected => new
 end
 
 @inline function Atomix.modify!(ref::CuIndexableRef, op::OP, x, order) where {OP}

From 68bf406574094e6b70da73cc2acf1b97aecf3521 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 12 Mar 2023 15:16:29 -0400
Subject: [PATCH 23/32] try to test sm_60, sm_35

---
 src/compiler/execution.jl                  |  2 +-
 src/compiler/gpucompiler.jl                |  2 +-
 src/compiler/reflection.jl                 |  2 +-
 src/device/intrinsics/atomics.jl           |  8 ++---
 test/device/intrinsics/lowlevel_atomics.jl | 34 ++++++++++++++++------
 5 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 3ac23cb4dc..c16277d1fd 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -40,7 +40,7 @@ macro cuda(ex...)
     macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
         split_kwargs(kwargs,
                      [:dynamic, :launch],
-                     [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline],
+                     [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline, :cap],
                      [:cooperative, :blocks, :threads, :shmem, :stream])
     if !isempty(other_kwargs)
         key,val = first(other_kwargs).args
diff --git a/src/compiler/gpucompiler.jl b/src/compiler/gpucompiler.jl
index bd39307b3a..a6364ebbbc 100644
--- a/src/compiler/gpucompiler.jl
+++ b/src/compiler/gpucompiler.jl
@@ -15,7 +15,7 @@ function device_properties(dev)
             cap = maximum(caps)
 
             # select the PTX ISA we assume to be available
-            # (we actually only need 6.2, but NVPTX doesn't support that)
+            # 6.3 introduced `atom.cas.b16`
             ptx = v"6.3"
 
             # we need to take care emitting LLVM instructions like `unreachable`, which
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
index 19e9f66d3d..4a1cb4a36d 100644
--- a/src/compiler/reflection.jl
+++ b/src/compiler/reflection.jl
@@ -125,7 +125,7 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
         function $method(io::IO, @nospecialize(func), @nospecialize(types);
                          kernel::Bool=false, minthreads=nothing, maxthreads=nothing,
                          blocks_per_sm=nothing, maxregs=nothing, always_inline::Bool=false,
-                         kwargs...)
+                         cap=capability(device()), kwargs...)
             source = FunctionSpec(func, Base.to_tuple_type(types), kernel)
             target = CUDACompilerTarget(device(); minthreads, maxthreads, blocks_per_sm, maxregs)
             params = CUDACompilerParams()
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 4d6e5ab65e..e8fe0f0cc6 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -405,7 +405,7 @@ for (order, scope, A, sz) in Iterators.product(
                                 (BlockScope, DeviceScope, SystemScope),
                                 (AS.Generic, AS.Global, AS.Shared),
                                 (2,4,8))
-    instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1];"
+    instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).$(suffix(sz)) \$0, [\$1];"
     constraint  = "=$(reg(sz)),l,~{memory}"
     @eval @inline __load(::Val{$sz}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T} =
         @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr)
@@ -416,7 +416,7 @@ for (order, scope, A) in Iterators.product(
                             (LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}),
                             (BlockScope, DeviceScope, SystemScope),
                             (AS.Generic, AS.Global, AS.Shared))
-    instruction = "ld$(addr_space(A)).$(asm(order)).$(asm(scope)).b8 \$0, [\$1];"
+    instruction = "ld.$(asm(order)).$(asm(scope))$(addr_space(A)).b8 \$0, [\$1];"
     constraint  = "=r,l,~{memory}"
     @eval @inline function __load(::Val{1}, ptr::LLVMPtr{T, $A}, ::$order, ::$scope) where {T}
         val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr)
@@ -430,7 +430,7 @@ end
 for (A, sz) in Iterators.product(
                     (AS.Generic, AS.Global, AS.Shared),
                     (2,4,8))
-    instruction = "ld$(addr_space(A)).volatile.$(suffix(sz)) \$0, [\$1];"
+    instruction = "ld.volatile$(addr_space(A)).$(suffix(sz)) \$0, [\$1];"
     constraint  = "=$(reg(sz)),l,~{memory}"
     @eval @inline __load_volatile(::Val{$sz}, ptr::LLVMPtr{T, $A}) where {T} =
         @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}}, ptr)
@@ -438,7 +438,7 @@ end
 
 # Handle byte sized load
 for (A) in (AS.Generic, AS.Global, AS.Shared)
-    instruction = "ld$(addr_space(A)).volatile.b8 \$0, [\$1];"
+    instruction = "ld.volatile$(addr_space(A)).b8 \$0, [\$1];"
     constraint  = "=r,l,~{memory}"
     @eval @inline function __load_volatile(::Val{1}, ptr::LLVMPtr{T, $A}) where {T}
         val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}}, ptr)
diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl
index 7122cd984f..0bedb80a3e 100644
--- a/test/device/intrinsics/lowlevel_atomics.jl
+++ b/test/device/intrinsics/lowlevel_atomics.jl
@@ -1,25 +1,41 @@
 using BFloat16s: BFloat16
 
+function atomic_types(cap)
+    types = [
+        Int32, Int64, 
+        UInt32, UInt64,
+        Float64, Float32]
+    if cap >= v"6.0"
+        append!(types, [
+            Int8, Int16,
+            UInt8, UInt16,
+            Float16])
+    end
+    return types
+end
+
 @testset "atomics (low-level) with order" begin
 
 @testset "atomic_load" begin
-    if capability(device()) >= v"6.0"
-        types = [Int8, Int16, Int32, Int64, 
-                 UInt8, UInt16, UInt32, UInt64,
-                 Float64, Float32]
+    capabilities = (v"3.5", v"6.0", v"7.0")
+    current_cap = capability(device())
+
+    capabilities = filter(c->c<=current_cap, capabilities)
+
+    @testset for cap in capabilities
+        types = atomic_types(cap)
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
-        supported_orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
-        unsupported_orders = [CUDA.release, CUDA.acq_rel]
+        orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
+        # unsupported_orders = [CUDA.release, CUDA.acq_rel]
 
         function kernel(a, order, scope)
             CUDA.atomic_load(pointer(a), order, scope)
             return
         end
 
-        @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+        @testset for (T, order, scope) in Iterators.product(types, orders, scopes)
             a = CuArray(T[0])
-            @cuda threads=1 kernel(a, order, scope)
+            @cuda cap=cap threads=1 kernel(a, order, scope)
             @test Array(a)[1] == 0
         end
     end

From f248574ccc75f6142047fb38de1e3373677a9485 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 13 Mar 2023 21:33:25 -0400
Subject: [PATCH 24/32] fix yet another silly mistake

---
 src/device/intrinsics/synchronization.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index e98baa16a2..fd59214452 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -131,11 +131,11 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
             throw(AtomicOrderUnsupported(order))
         end
     else
-        if order == seq_cst() ||
-           order == consume() ||
-           order == acquire() ||
-           order == acq_rel() ||
-           order == release()
+        if order == seq_cst ||
+           order == consume ||
+           order == acquire ||
+           order == acq_rel ||
+           order == release
 
             threadfence_device()
         else

From 992630ec189c66ac18dfc5273e5af1308ae02205 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:03:15 -0400
Subject: [PATCH 25/32] walkback sm_50 support

---
 src/compiler/execution.jl                  |  2 +-
 src/device/intrinsics/atomics.jl           | 26 ++--------------
 test/device/intrinsics/lowlevel_atomics.jl | 36 ++++++----------------
 3 files changed, 13 insertions(+), 51 deletions(-)

diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index c16277d1fd..3ac23cb4dc 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -40,7 +40,7 @@ macro cuda(ex...)
     macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
         split_kwargs(kwargs,
                      [:dynamic, :launch],
-                     [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline, :cap],
+                     [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :always_inline],
                      [:cooperative, :blocks, :threads, :shmem, :stream])
     if !isempty(other_kwargs)
         key,val = first(other_kwargs).args
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index e8fe0f0cc6..cf60d2d077 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -474,16 +474,7 @@ end
         atomic_thread_fence(order, scope)
         return val
     else
-        # Fallback to threadfence w/o order + load_volatile
-        if order == seq_cst
-            threadfence(scope)
-        end
-        val = __load_volatile(ptr)
-        if order == monotonic
-            return val
-        end
-        threadfence(scope)
-        return val
+        throw(AtomicUnsupported{T}())
     end
 end
 
@@ -533,11 +524,7 @@ end
         end
         __store_volatile!(ptr, val)
     else
-        # Fallback to threadfence w/o order + store_volatile
-        if order == seq_cst
-            threadfence(scope)
-        end
-        __store_volatile!(ptr, val)
+        throw(AtomicUnsupported{T}())
     end
 end
 
@@ -616,14 +603,7 @@ end
             atomic_thread_fence(seq_cst, scope)
         end
     else
-        # Fallback to atomic_cas w/o scope on pre SM_60
-        if order == seq_cst || order == acq_rel || order == release
-            threadfence(scope)
-        end
-        old = atomic_cas!(ptr, expected, new)
-        if order == seq_cst || order == acq_rel || order == acquire # order == consume
-            threadfence(scope)
-        end
+        throw(AtomicUnsupported{T}())
     end
     return old
 end
diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl
index 0bedb80a3e..dfe2c06e96 100644
--- a/test/device/intrinsics/lowlevel_atomics.jl
+++ b/test/device/intrinsics/lowlevel_atomics.jl
@@ -1,32 +1,15 @@
 using BFloat16s: BFloat16
 
-function atomic_types(cap)
-    types = [
-        Int32, Int64, 
-        UInt32, UInt64,
-        Float64, Float32]
-    if cap >= v"6.0"
-        append!(types, [
-            Int8, Int16,
-            UInt8, UInt16,
-            Float16])
-    end
-    return types
-end
-
 @testset "atomics (low-level) with order" begin
 
 @testset "atomic_load" begin
-    capabilities = (v"3.5", v"6.0", v"7.0")
-    current_cap = capability(device())
-
-    capabilities = filter(c->c<=current_cap, capabilities)
-
-    @testset for cap in capabilities
-        types = atomic_types(cap)
+    if capability(device()) >= v"6.0"
+        types = [Int8, Int16, Int32, Int64, 
+                 UInt8, UInt16, UInt32, UInt64,
+                 Float64, Float32]
+        # TODO Float16
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
         orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
-        # unsupported_orders = [CUDA.release, CUDA.acq_rel]
 
         function kernel(a, order, scope)
             CUDA.atomic_load(pointer(a), order, scope)
@@ -35,7 +18,7 @@ end
 
         @testset for (T, order, scope) in Iterators.product(types, orders, scopes)
             a = CuArray(T[0])
-            @cuda cap=cap threads=1 kernel(a, order, scope)
+            @cuda threads=1 kernel(a, order, scope)
             @test Array(a)[1] == 0
         end
     end
@@ -46,17 +29,16 @@ end
         types = [Int8, Int16, Int32, Int64, 
                  UInt8, UInt16, UInt32, UInt64,
                  Float64, Float32]
+        # TODO Float16
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
-        supported_orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
-        unsupported_orders = [CUDA.acquire, CUDA.acq_rel]
+        orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
 
         function kernel(a, val, order, scope)
             CUDA.atomic_store!(pointer(a), val, order, scope)
             return
         end
 
-        @testset for (T, order, scope) in Iterators.product(types, supported_orders, scopes)
+        @testset for (T, order, scope) in Iterators.product(types, orders, scopes)
             a = CuArray(T[0])
             @cuda threads=1 kernel(a, one(T), order, scope)
             @test Array(a)[1] == one(T)

From fa0e3a290afca3b18e278e636db8e50d1d662f73 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:15:12 -0400
Subject: [PATCH 26/32] fix CAS call

---
 src/device/intrinsics/atomics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index cf60d2d077..acef5aaed5 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -569,7 +569,7 @@ for (order, scope, A, sz) in Iterators.product(
 end
 
 @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T =
-    __cas(sizeof(T), ptr, old, new, order, scope)
+    __cas!(sizeof(T), ptr, old, new, order, scope)
 
 for (scope, A, sz) in Iterators.product(
                                 (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),

From c9396fe7e484564924beba10f689eb266801dd95 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:18:12 -0400
Subject: [PATCH 27/32] fix threadfence on sm_60

---
 src/device/intrinsics/synchronization.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index fd59214452..2cf19e7cf4 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -132,7 +132,7 @@ function atomic_thread_fence(order, scope::DeviceScope=device_scope)
         end
     else
         if order == seq_cst ||
-           order == consume ||
+           # order == consume ||
            order == acquire ||
            order == acq_rel ||
            order == release

From 2f5948807e55eb1bbc6feb5ec729ab60c97ed6cb Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:28:14 -0400
Subject: [PATCH 28/32] fixup! fix CAS call

---
 src/device/intrinsics/atomics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index acef5aaed5..7d041e355d 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -569,7 +569,7 @@ for (order, scope, A, sz) in Iterators.product(
 end
 
 @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T =
-    __cas!(sizeof(T), ptr, old, new, order, scope)
+    __cas!(Val(sizeof(T)), ptr, old, new, order, scope)
 
 for (scope, A, sz) in Iterators.product(
                                 (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),

From f937f114e7f04b55b22c371937dff10484046ef5 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:28:28 -0400
Subject: [PATCH 29/32] fix store_volatile! call

---
 src/device/intrinsics/atomics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 7d041e355d..65c8348ced 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -503,7 +503,7 @@ end
 
 # Could be done using LLVM.
 @inline __store_volatile!(ptr::LLVMPtr{T}, val::T) where {T} =
-    __store_volatile(Val(sizeof(T)), ptr, val)
+    __store_volatile!(Val(sizeof(T)), ptr, val)
 
 @inline function atomic_store!(ptr::LLVMPtr{T}, val::T, order, scope::SyncScope=device_scope) where T
     if order == acq_rel || order == acquire # || order == consume

From f7e938a24a28727819921a7378d91fd94abf0f55 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:28:41 -0400
Subject: [PATCH 30/32] test Float16 and BFloat16

---
 test/device/intrinsics/lowlevel_atomics.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl
index dfe2c06e96..1722b0af3e 100644
--- a/test/device/intrinsics/lowlevel_atomics.jl
+++ b/test/device/intrinsics/lowlevel_atomics.jl
@@ -6,8 +6,7 @@ using BFloat16s: BFloat16
     if capability(device()) >= v"6.0"
         types = [Int8, Int16, Int32, Int64, 
                  UInt8, UInt16, UInt32, UInt64,
-                 Float64, Float32]
-        # TODO Float16
+                 BFloat16, Float16, Float64, Float32]
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
         orders = [CUDA.monotonic, CUDA.acquire, CUDA.seq_cst]
 
@@ -28,8 +27,7 @@ end
     if capability(device()) >= v"6.0"
         types = [Int8, Int16, Int32, Int64, 
                  UInt8, UInt16, UInt32, UInt64,
-                 Float64, Float32]
-        # TODO Float16
+                 BFloat16, Float16, Float64, Float32]
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
         orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst]
 

From aed3933fc85813253e32a5ebc0badd2ba726c4ef Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Mar 2023 21:44:42 -0400
Subject: [PATCH 31/32] fix sizeof checks in cas

---
 src/device/intrinsics/atomics.jl           | 19 ++++++++++++++-----
 test/device/intrinsics/lowlevel_atomics.jl |  1 -
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 65c8348ced..a370baa912 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -453,6 +453,9 @@ end
     if order == acq_rel || order == release
         throw(AtomicOrderUnsupported(order))
     end
+    if sizeof(T) > 8
+        throw(AtomicUnsupported{T}())
+    end
     if compute_capability() >= sv"7.0"
         if order == monotonic
             val = __load(ptr, monotonic, scope)
@@ -509,6 +512,9 @@ end
     if order == acq_rel || order == acquire # || order == consume
         throw(AtomicOrderUnsupported(order))
     end
+    if sizeof(T) > 8
+        throw(AtomicUnsupported{T}())
+    end
     if compute_capability() >= sv"7.0"
         if order == release
             __store!(ptr, val, release, scope)
@@ -577,16 +583,19 @@ for (scope, A, sz) in Iterators.product(
                                 (2, 4, 8))
     instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;"
     constraint  = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}"
-    @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+    @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
         @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
-@inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T =
-    __cas!(Val(sizeof(T)), ptr, old, new, scope)
+@inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T =
+    __cas_old!(Val(sizeof(T)), ptr, old, new, scope)
 
 @inline function atomic_cas!(ptr::LLVMPtr{T}, expected::T, new::T, success_order, failure_order, scope::SyncScope=device_scope) where T
     order = stronger_order(success_order, failure_order)
-    if compute_capability() >= sv"7.0" && 2 <= sizeof(T) <= 4
+    if sizeof(T) > 8 || sizeof(T) < 2
+        throw(AtomicUnsupported{T}())
+    end
+    if compute_capability() >= sv"7.0" && 4 <= sizeof(T) <= 8
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end
@@ -598,7 +607,7 @@ end
         if order == seq_cst || order == acq_rel || order == release
             atomic_thread_fence(seq_cst, scope)
         end
-        old = __cas!(ptr, expected, new, scope)
+        old = __cas_old!(ptr, expected, new, scope)
         if order == seq_cst || order == acq_rel || order == acquire # order == consume
             atomic_thread_fence(seq_cst, scope)
         end
diff --git a/test/device/intrinsics/lowlevel_atomics.jl b/test/device/intrinsics/lowlevel_atomics.jl
index 1722b0af3e..0db2c30527 100644
--- a/test/device/intrinsics/lowlevel_atomics.jl
+++ b/test/device/intrinsics/lowlevel_atomics.jl
@@ -51,7 +51,6 @@ end
                  UInt32, UInt64,
                  Float64, Float32]
         scopes = [CUDA.block_scope, CUDA.device_scope, CUDA.system_scope]
-        # TODO unordered
         orders = [CUDA.monotonic, CUDA.release, CUDA.seq_cst, CUDA.acquire, CUDA.acq_rel]
 
         function kernel(a, expected, desired, success_order, failure_order, scope)

From 66fadf5014cb33b84ae9aee235e3a1c0c90cd179 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 21 Mar 2023 12:17:52 -0400
Subject: [PATCH 32/32] handle byte size cas

---
 src/device/intrinsics/atomics.jl | 33 ++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index a370baa912..876a456652 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -567,26 +567,51 @@ for (order, scope, A, sz) in Iterators.product(
                                 (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
                                 (BlockScope, DeviceScope, SystemScope),
                                 (AS.Generic, AS.Global, AS.Shared),
-                                (4, 8))
+                                (2, 4, 8))
     instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;"
     constraint  = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}"
     @eval @inline __cas!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T} =
         @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
+# Byte-sized CAS: the asm result is returned as a UInt32 and truncated to its low byte.
+for (order, scope, A) in Iterators.product(
+                            (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                            (BlockScope, DeviceScope, SystemScope),
+                            (AS.Generic, AS.Global, AS.Shared))
+    instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(asm(scope)).b8 \$0, [\$1], \$2, \$3;" # NOTE(review): confirm the target PTX ISA accepts a .b8 CAS
+    constraint  = "=r,l,r,r,~{memory}"
+    @eval @inline function __cas!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order, ::$scope) where {T}
+        val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+        return Core.bitcast(T, val % UInt8)
+    end
+end
+
 @inline __cas!(ptr::LLVMPtr{T}, old::T, new::T, order, scope) where T =
     __cas!(Val(sizeof(T)), ptr, old, new, order, scope)
 
-for (scope, A, sz) in Iterators.product(
+for (order, A, sz) in Iterators.product(
                                 (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
                                 (AS.Generic, AS.Global, AS.Shared),
                                 (2, 4, 8))
-    instruction = "atom$(addr_space(A)).cas.$(asm(scope)).$(suffix(sz)) \$0, [\$1], \$2, \$3;"
+    instruction = "atom$(addr_space(A)).cas.$(asm(order)).$(suffix(sz)) \$0, [\$1], \$2, \$3;" # ordering-qualified CAS, no scope qualifier
     constraint  = "=$(reg(sz)),l,$(reg(sz)),$(reg(sz)),~{memory}"
-    @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$scope) where {T} =
+    @eval @inline __cas_old!(::Val{$sz}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order) where {T} =
         @asmcall($instruction, $constraint, true, T, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
 end
 
+# Byte-sized CAS without a scope qualifier: result is returned as a UInt32 and truncated to its low byte.
+for (order, A) in Iterators.product(
+                            (LLVMOrdering{:acq_rel}, LLVMOrdering{:acquire}, LLVMOrdering{:monotonic}, LLVMOrdering{:release}),
+                            (AS.Generic, AS.Global, AS.Shared))
+    instruction = "atom$(addr_space(A)).cas.$(asm(order)).b8 \$0, [\$1], \$2, \$3;" # NOTE(review): confirm the target PTX ISA accepts a .b8 CAS
+    constraint  = "=r,l,r,r,~{memory}"
+    @eval @inline function __cas_old!(::Val{1}, ptr::LLVMPtr{T, $A}, old::T, new::T, ::$order) where {T}
+        val = @asmcall($instruction, $constraint, true, UInt32, Tuple{LLVMPtr{T, $A}, T, T}, ptr, old, new)
+        return Core.bitcast(T, val % UInt8)
+    end
+end
+
 @inline __cas_old!(ptr::LLVMPtr{T}, old::T, new::T, scope) where T =
     __cas_old!(Val(sizeof(T)), ptr, old, new, scope)
 
@@ -595,7 +620,7 @@ end
     if sizeof(T) > 8 || sizeof(T) < 2
         throw(AtomicUnsupported{T}())
     end
-    if compute_capability() >= sv"7.0" && 4 <= sizeof(T) <= 8
+    if compute_capability() >= sv"7.0"
         if order == seq_cst
             atomic_thread_fence(seq_cst, scope)
         end