Accumulate crashes on CUDA with DecoupledLookback

Minimal working example (MWE):
```julia
using CUDA
using AcceleratedKernels

v = CUDA.ones(Int32, 2^15)
AcceleratedKernels.accumulate(+, v; init=Int32(0), alg=AcceleratedKernels.DecoupledLookback())
```

Running this fails during GPU kernel compilation with the following LLVM error:
```
julia> AcceleratedKernels.accumulate(+, v; init=Int32(0), alg=AcceleratedKernels.DecoupledLookback())
ERROR: LLVM error: Cannot select: 0x2c2518f0: ch = AtomicFence 0x2aac16f0, TargetConstant:i64<4>, TargetConstant:i64<1>, /home/reinha57/.julia/packages/UnsafeAtomics/vpyYB/src/core.jl:248 @[ /home/reinha57/.julia/packages/UnsafeAtomics/vpyYB/src/core.jl:12 @[ /home/reinha57/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:174 @[ none:0 ] ] ]
  0x2ded3310: i64 = TargetConstant<4>
  0x2c2511f0: i64 = TargetConstant<1>
In function: _Z25gpu__accumulate_previous_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_10StaticSizeI6_256__ES8_vEE1_13CuDeviceArrayI5Int32Li1ELi1EESF_I5UInt8Li1ELi1EESH_
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/iza6e/src/core/context.jl:194
  [2] LLVMTargetMachineEmitToMemoryBuffer
    @ ~/.julia/packages/LLVM/iza6e/lib/18/libLLVM.jl:11531 [inlined]
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/iza6e/src/targetmachine.jl:118
  [4] mcgen
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/mcgen.jl:75 [inlined]
  [5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:127
  [6] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:438
  [7] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:115
  [8] compile_unhooked
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:80 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:67
 [10] compile
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:55 [inlined]
 [11] #compile##0
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:250 [inlined]
 [12] JuliaContext(f::CUDA.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:34
 [13] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:25
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:249
 [15] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/execution.jl:245
 [16] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/execution.jl:159
 [17] macro expansion
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:373 [inlined]
 [18] macro expansion
    @ ./lock.jl:376 [inlined]
 [19] cufunction(f::typeof(AcceleratedKernels.gpu__accumulate_previous!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{…}, typeof(+), CuDeviceVector{…}, CuDeviceVector{…}, CuDeviceVector{…}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Int64})
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:368
 [20] macro expansion
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:112 [inlined]
 [21] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(AcceleratedKernels.gpu__accumulate_previous!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/x8d2s/src/CUDAKernels.jl:127
 [22] accumulate_1d_gpu!(op::typeof(+), v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend, ::AcceleratedKernels.DecoupledLookback; init::Int32, neutral::Int32, inclusive::Bool, max_tasks::Int64, min_elems::Int64, block_size::Int64, temp::Nothing, temp_flags::Nothing)
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:307
 [23] accumulate_1d_gpu!
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:257 [inlined]
 [24] _accumulate_impl!(op::typeof(+), v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend; init::Int32, neutral::Int32, dims::Nothing, inclusive::Bool, alg::AcceleratedKernels.DecoupledLookback, max_tasks::Int64, min_elems::Int64, prefer_threads::Bool, block_size::Int64, temp::Nothing, temp_flags::Nothing)
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:171
 [25] #accumulate!#99
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:128 [inlined]
 [26] accumulate(op::Function, v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend; init::Int32, kwargs::@Kwargs{alg::AcceleratedKernels.DecoupledLookback})
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:227
 [27] accumulate
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:219 [inlined]
 [28] top-level scope
    @ REPL[4]:1
Some type information was truncated. Use `show(err)` to see complete types.
```

```
julia> versioninfo()
Julia Version 1.12.3
Commit 966d0af0fdf (2025-12-15 11:20 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 64 × AMD EPYC 7452 32-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, znver2)
  GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 64 virtual cores)

julia> CUDA.versioninfo()
CUDA toolchain: 
- runtime 13.0, artifact installation
- driver 580.95.5 for 13.0
- compiler 13.0

CUDA libraries: 
- CUBLAS: 13.1.0
- CURAND: 10.4.0
- CUFFT: 12.0.0
- CUSOLVER: 12.0.4
- CUSPARSE: 12.6.3
- CUPTI: 2025.3.1 (API 13.0.1)
- NVML: 13.0.0+580.95.5

Julia packages: 
- CUDA: 5.9.4
- CUDA_Driver_jll: 13.0.2+0
- CUDA_Compiler_jll: 0.3.0+0
- CUDA_Runtime_jll: 0.19.2+0

Toolchain:
- Julia: 1.12.3
- LLVM: 18.1.7

1 device:
  0: NVIDIA A30 (sm_80, 10.925 GiB / 24.000 GiB available)
```

Relevant package versions:
```
(@v1.12) pkg> status
Status `~/.julia/environments/v1.12/Project.toml`
  [21141c5a] AMDGPU v2.1.4
  [6a4ca0a5] AcceleratedKernels v0.4.3
  [052768ef] CUDA v5.9.5
  [63c18a36] KernelAbstractions v0.9.39
  [295af30f] Revise v3.12.3
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Accumulate crashes on CUDA with DecoupledLookback #74

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Accumulate crashes on CUDA with DecoupledLookback #74

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions