JuliaGPU · maleadt · May 10, 2026 · May 10, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -53,5 +53,5 @@ steps:
         matrix:
           setup:
             julia:
-              - "1.10"
+              - "1.11"
               - "1.12"
diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml
@@ -22,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: ['1.10', '1.12']
+        version: ['1.11', '1.12']
         os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15-intel, windows-2022]
         arch: [x64, arm64]
         pocl: [jll, local]

diff --git a/Project.toml b/Project.toml
@@ -4,6 +4,7 @@ version = "0.10.9"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -26,10 +27,11 @@ SPIRVIntrinsics = {path = "lib/intrinsics"}
 
 [compat]
 Adapt = "4"
+CompilerCaching = "0.2.4"
 GPUArrays = "11.2.1"
-GPUCompiler = "1.7.1"
+GPUCompiler = "1.10"
 KernelAbstractions = "0.9.38"
-LLVM = "9.1"
+LLVM = "9.6"
 LinearAlgebra = "1"
 OpenCL_jll = "=2024.10.24"
 Preferences = "1"
@@ -42,4 +44,7 @@ SPIRVIntrinsics = "0.5.7"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
-julia = "1.10"
+julia = "1.11"
+
+[sources]
+GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"}
diff --git a/src/OpenCL.jl b/src/OpenCL.jl
@@ -1,6 +1,7 @@
 module OpenCL
 
 using GPUCompiler
+using CompilerCaching: CacheView, lookup, results
 using LLVM, LLVM.Interop
 using SPIRV_LLVM_Backend_jll, SPIRV_Tools_jll
 using Adapt

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -7,8 +7,46 @@ end
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}
 
+"""
+    OpenCLResults
+
+Cached compilation results attached to each OpenCL `CodeInstance`. Fields are populated
+through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which
+GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`),
+`obj` (SPIR-V bytes) + `entry` + `device_rng` after main codegen, and `kernels` after
+the session-local link onto an OpenCL context. The first four are session-portable
+(cached through precompilation); `kernels` is session-local.
+
+`kernels` is a small linear cache of `(cl.Context, cl.Kernel)` pairs. The cache partition
+already covers everything that affects codegen via `GPUCompiler.cache_owner`, so the only
+runtime-visible dimension left is the OpenCL context that owns the linked `cl.Kernel`.
+A linear scan with `===` is fastest in the common case (n=1) and stays cheap for the
+rare workload that bounces between a handful of contexts on the same device.
+"""
+mutable struct OpenCLResults
+    bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}}  # (opaque_pointers, bytes)
+    obj::Union{Nothing, Vector{UInt8}}                   # SPIR-V binary
+    entry::Union{Nothing, String}                        # name of the kernel entry function
+    device_rng::Bool                                     # entry has "julia.opencl.rng" attribute
+    kernels::Vector{Tuple{cl.Context, cl.Kernel}}        # session-local; linear-scanned
+    OpenCLResults() = new(nothing, nothing, nothing, false, Tuple{cl.Context, cl.Kernel}[])
+end
+
+function GPUCompiler.bitcode(r::OpenCLResults, opaque_pointers::Bool)
+    cached = r.bitcode  # read the field once; `nothing` means no bitcode is stored
+    isnothing(cached) && return nothing
+    return first(cached) === opaque_pointers ? last(cached) : nothing  # mode must match
+end
+
+function GPUCompiler.bitcode!(r::OpenCLResults, bytes::Vector{UInt8}, opaque_pointers::Bool)
+    r.bitcode = (opaque_pointers, bytes)  # keep the pointer mode next to the bytes it applies to
+    return nothing
+end
+
 GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams}) = OpenCL
 
+GPUCompiler.results_type(::OpenCLCompilerJob) = OpenCLResults
+
 GPUCompiler.method_table_view(job::OpenCLCompilerJob) =
     GPUCompiler.StackedMethodTable(job.world, method_table, SPIRVIntrinsics.method_table)
 
@@ -111,18 +149,7 @@ function GPUCompiler.finish_linked_module!(@nospecialize(job::OpenCLCompilerJob)
     return
 end
 
-## compiler implementation (cache, configure, compile, and link)
-
-# cache of compilation caches, per context
-const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}()
-function compiler_cache(ctx::cl.Context)
-    cache = get(_compiler_caches, ctx, nothing)
-    if cache === nothing
-        cache = Dict{Any, Any}()
-        _compiler_caches[ctx] = cache
-    end
-    return cache
-end
+## compiler implementation (configure, compile, and link)
 
 # cache of compiler configurations, per device (but additionally configurable via kwargs)
 const _toolchain = Ref{Any}()
@@ -153,37 +180,35 @@ end
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
-# compile to executable machine code
+# run inference + LLVM codegen + SPIR-V emission. returns a `(; obj, entry, device_rng)` named tuple,
+# all session-portable so they survive precompilation when stored on a cached `CodeInstance`.
 const compilations = Threads.Atomic{Int}(0)
-function compile(@nospecialize(job::CompilerJob))
+function compile_to_obj(@nospecialize(job::CompilerJob))
-    compilations[] += 1
+    Threads.atomic_add!(compilations, 1)  # atomic RMW; `x[] += 1` is a separate load + store
 
-    # TODO: this creates a context; cache those.
-    obj, meta = JuliaContext() do ctx
+    JuliaContext() do ctx  # last expression: the value of this call is what the function returns
         obj, meta = GPUCompiler.compile(:obj, job)
-
         entry = LLVM.name(meta.entry)
         device_rng = StringAttribute("julia.opencl.rng", "") in collect(function_attributes(meta.entry))
-
         (; obj, entry, device_rng)
     end
 end
 
-# link into an executable kernel
-function link(@nospecialize(job::CompilerJob), compiled)
+# link the SPIR-V bytes into a session-local `cl.Kernel` on the active context.
+function link_kernel(obj::Vector{UInt8}, entry::String)
     prog = if "cl_khr_il_program" in cl.device().extensions
-        cl.Program(; il=compiled.obj)
+        cl.Program(; il=obj)  # device advertises IL support: hand it the SPIR-V bytes as-is
     else
         error("Your device does not support SPIR-V, which is currently required for native execution.")
         # XXX: kpet/spirv2clc#87, caused by KhronosGroup/SPIRV-LLVM-Translator#2029
         source = mktempdir() do dir
             il = joinpath(dir, "kernel.spv")
-            write(il, compiled.obj)
+            write(il, obj)  # unreachable while the unconditional `error` above is in place
             cmd = `spirv2clc $il`
             read(cmd, String)
         end
         cl.Program(; source)
     end
     cl.build!(prog)
-    (; kernel=cl.Kernel(prog, compiled.entry), compiled.device_rng)
+    return cl.Kernel(prog, entry)  # kernel object for the function named `entry` in `prog`
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -180,28 +180,52 @@ end
 const clfunction_lock = ReentrantLock()
 
 function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
-    ctx = cl.context()
-    dev = cl.device()
-
     Base.@lock clfunction_lock begin
-        # compile the function
-        cache = compiler_cache(ctx)
+        config = compiler_config(cl.device(); kwargs...)::OpenCLCompilerConfig
         source = methodinstance(F, tt)
-        config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig
-        linked = GPUCompiler.cached_compilation(cache, source, config, compile, link)
-
-        # create a callable object that captures the function instance. we don't need to think
-        # about world age here, as GPUCompiler already does and will return a different object
-        h = hash(linked.kernel, hash(f, hash(tt)))
-        kernel = get(_kernel_instances, h, nothing)
+        job = CompilerJob(source, config)
+        cache = GPUCompiler.cache_view(job)
+
+        # `@something` (not the `something` function) so `compile_opencl!` only runs
+        # on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs
+        # the full LLVM compile on every launch.
+        _, res = @something lookup(cache, source) compile_opencl!(cache, job)  # instance unused
+
+        # Resolve the cl.Kernel for the active context. Linear scan over the
+        # session-local cache; almost always n=1, so this is one `===` compare.
+        ctx = cl.context()
+        kernel = nothing
+        for (cached_ctx, cached_kernel) in res.kernels
+            if cached_ctx === ctx
+                kernel = cached_kernel
+                break
+            end
+        end
         if kernel === nothing
-            # create the kernel state object
-            kernel = HostKernel{F,tt}(f, linked.kernel, linked.device_rng)
-            _kernel_instances[h] = kernel
+            kernel = link_kernel(res.obj::Vector{UInt8}, res.entry::String)
+            push!(res.kernels, (ctx, kernel))
         end
-        return kernel::HostKernel{F,tt}
+
+        h = hash(kernel, hash(f, hash(tt)))
+        linked = kernel  # assigned-once alias: capturing the reassigned `kernel` would box it
+        make() = HostKernel{F,tt}(f, linked, res.device_rng)
+        get!(make, _kernel_instances, h)::HostKernel{F,tt}
     end
 end
 
+# Compile `job` (inference + codegen) and record the session-portable artifacts on the
+# `OpenCLResults` of its `CodeInstance`. That `CodeInstance` is created during inference
+# inside `GPUCompiler.compile` (which uses the same owner-partitioned `CacheView`), and
+# gets a fresh `OpenCLResults()` attached via `@setup_caching`'s `finish!` hook.
+function compile_opencl!(cache::CacheView, @nospecialize(job::CompilerJob))
+    (; obj, entry, device_rng) = compile_to_obj(job)
+    inst = get(cache, job.source, nothing)::Core.CodeInstance  # asserted present after compile
+    slot = results(cache, inst)::OpenCLResults
+    slot.obj = obj
+    slot.entry = entry
+    slot.device_rng = device_rng
+    return (inst, slot)
+end
+
 # cache of kernel instances
 const _kernel_instances = Dict{UInt, Any}()
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,5 +53,5 @@ steps: @@
             matrix:
               setup:
                 julia:
-                  - "1.10"
+                  - "1.11"
                   - "1.12"