caused by: NSError: Compute function exceeds available temporary registers (AGXMetal13_3, code 3)
Stacktrace:
[1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
@ Metal.MTL /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/lib/mtl/compute_pipeline.jl:60
[2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
@ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:71
[3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
@ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:66
[4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:132
[5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:103
[6] macro expansion
@ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:162 [inlined]
[7] macro expansion
@ ./lock.jl:267 [inlined]
[8] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:157
[9] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}})
@ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:155
[10] macro expansion
@ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:77 [inlined]
[11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(DiffEqGPU.gpu_ode_asolve_kernel)})(::MtlVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}}, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
@ Metal.MetalKernels /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:105
[12] Kernel
@ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:101 [inlined]
[13] #vectorized_asolve#166
@ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/build/default-macmini-aarch64-3-0/julialang/diffeqgpu-dot-jl/src/solve.jl:182 [inlined]
It's interesting because IIUC the dynamic workgroup size setting there should have used maxTotalThreadsPerThreadgroup, which in the case of CUDA takes register usage into account. Maybe there's additional limits we need to respect with Metal?
As seen on DiffEqGPU.jl:
It's interesting because IIUC the dynamic workgroup size setting there should have used
maxTotalThreadsPerThreadgroup, which in the case of CUDA takes register usage into account. Maybe there's additional limits we need to respect with Metal?