JuliaGPU · maleadt · May 10, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -28,7 +28,6 @@ steps:
         matrix:
           setup:
             julia:
-              - "1.10"
               - "1.11"
               - "1.12"
 

diff --git a/Project.toml b/Project.toml
@@ -10,6 +10,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
+CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
@@ -41,9 +42,10 @@ Adapt = "4.5"
 BFloat16s = "0.5, 0.6"
 CEnum = "0.4, 0.5"
 CodecBzip2 = "0.8.5"
+CompilerCaching = "0.2.4"
 ExprTools = "0.1"
 GPUArrays = "11.5"
-GPUCompiler = "1.7.1"
+GPUCompiler = "1.10"
 GPUToolbox = "0.1, 0.2, 0.3, 1"
 KernelAbstractions = "0.9.38"
 LLVM = "7.2, 8, 9"
@@ -61,4 +63,7 @@ ScopedValues = "1.3.0"
 SpecialFunctions = "2"
 StaticArrays = "1"
 UUIDs = "1"
-julia = "1.10"
+julia = "1.11"
+
+[sources]  # NOTE(review): branch pin below is presumably temporary until GPUCompiler 1.10 is registered — confirm before merging
+GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"}
diff --git a/src/Metal.jl b/src/Metal.jl
@@ -3,6 +3,7 @@ module Metal
 using GPUArrays
 using Adapt
 using GPUCompiler
+using CompilerCaching: CacheView, lookup, results
 using GPUToolbox
 using LLVM
 using LLVM.Interop

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -4,12 +4,51 @@ struct MetalCompilerParams <: AbstractCompilerParams end
 const MetalCompilerConfig = CompilerConfig{MetalCompilerTarget, MetalCompilerParams}
 const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams}
 
+"""
+    MetalResults
+
+Cached compilation results attached to each Metal `CodeInstance`. Fields are populated
+through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which
+GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`),
+`metallib` + `entry` after AIR downgrade + library wrap, and `pipelines` after the
+session-local link onto an `MTLDevice`. The first three are session-portable (cached
+through precompilation); `pipelines` is session-local.
+
+`pipelines` is a small linear cache of `(MTLDevice, MTLComputePipelineState)` pairs.
+The cache partition (via `GPUCompiler.cache_owner`) already covers the macOS / AIR /
+Metal versions that affect codegen, so the only runtime-visible dimension left is the
+`MTLDevice` that owns the linked pipeline state. A linear scan with `===` is fastest in
+the common case (n=1, single device per process) and remains cheap when multiple GPUs
+are addressed (e.g. integrated + discrete on a Mac).
+"""
+mutable struct MetalResults
+    bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}}  # (opaque_pointers, bytes)
+    metallib::Union{Nothing, Vector{UInt8}}  # library bytes handed to `MTLLibraryFromData`
+    entry::Union{Nothing, String}  # function name looked up in the library via `MTLFunction`
+    pipelines::Vector{Tuple{MTLDevice, MTLComputePipelineState}}  # session-local (device, pipeline) pairs
+    MetalResults() = new(nothing, nothing, nothing,
+                         Tuple{MTLDevice, MTLComputePipelineState}[])  # starts empty: nothing compiled or linked
+end
+
+function GPUCompiler.bitcode(r::MetalResults, opaque_pointers::Bool)
+    cached = r.bitcode  # load the mutable field once so the `nothing` check narrows it
+    cached === nothing && return nothing
+    return cached[1] === opaque_pointers ? cached[2] : nothing  # pointer mode must match
+end
+
+function GPUCompiler.bitcode!(r::MetalResults, bytes::Vector{UInt8}, opaque_pointers::Bool)
+    r.bitcode = (opaque_pointers, bytes)  # flag first; `GPUCompiler.bitcode` compares it
+    return nothing
+end
+
 GPUCompiler.runtime_module(::MetalCompilerJob) = Metal
 
 GPUCompiler.method_table(::MetalCompilerJob) = method_table
 
 GPUCompiler.kernel_state_type(job::MetalCompilerJob) = KernelState
 
+GPUCompiler.results_type(::MetalCompilerJob) = MetalResults  # results container used for Metal jobs
+
 function GPUCompiler.finish_module!(@nospecialize(job::MetalCompilerJob),
                                     mod::LLVM.Module, entry::LLVM.Function)
     entry = invoke(GPUCompiler.finish_module!,
@@ -115,18 +154,7 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob),
 end
 
 
-## compiler implementation (cache, configure, compile, and link)
-
-# cache of compilation caches, per device
-const _compiler_caches = Dict{MTLDevice, Dict{Any, Any}}()
-function compiler_cache(ctx::MTLDevice)
-    cache = get(_compiler_caches, ctx, nothing)
-    if cache === nothing
-        cache = Dict{Any, Any}()
-        _compiler_caches[ctx] = cache
-    end
-    return cache
-end
+## compiler implementation (configure, compile, and link)
 
 # cache of compiler configurations, per device (but additionally configurable via kwargs)
 const _toolchain = Ref{Any}()
@@ -163,36 +191,36 @@ end
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
-# compile to executable machine code
-function compile(@nospecialize(job::CompilerJob))
+# run inference + LLVM codegen, downgrade to AIR, wrap in a Metal library.
+# returns the named tuple `(; metallib, entry)` — a `Vector{UInt8}` and a `String`, both
+# session-portable so they survive precompilation when stored on a cached `CodeInstance`.
+function compile_to_metallib(@nospecialize(job::CompilerJob))
     @signpost_event log=log_compiler() "Compile" "Job=$job"
 
     @signpost_interval log=log_compiler() "Generate LLVM IR" begin
-        # TODO: on 1.9, this actually creates a context. cache those.
         ir, entry = JuliaContext() do ctx
             mod, meta = GPUCompiler.compile(:llvm, job)
             string(mod), LLVM.name(meta.entry)
         end
     end
 
     @signpost_interval log=log_compiler() "Downgrade to AIR" begin
-        # generate AIR
         air = let
             input = Pipe()
             output = Pipe()
-            log = Pipe()
+            errlog = Pipe()
 
             cmd = `$(LLVMDowngrader_jll.llvm_as()) --bitcode-version=5.0 -o -`
-            proc = run(pipeline(cmd, stdout=output, stderr=log, stdin=input); wait=false)
+            proc = run(pipeline(cmd, stdout=output, stderr=errlog, stdin=input); wait=false)
             close(output.in)
-            close(log.in)
+            close(errlog.in)
 
             writer = @async begin
                 write(input, ir)
                 close(input)
             end
             reader = @async read(output)
-            logger = @async read(log, String)
+            logger = @async read(errlog, String)
 
             try
                 wait(proc)
@@ -237,43 +265,30 @@ function compile(@nospecialize(job::CompilerJob))
         end
     end
 
-    return (; ir, air, metallib, entry)
+    return (; metallib, entry)
 end
 
-# link into an executable kernel
-@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
-    @signpost_event log=log_compiler() "Link" "Job=$job"
+# link the metallib into a session-local pipeline state on the given device.
+@autoreleasepool function link_pipeline(dev::MTLDevice, metallib::Vector{UInt8}, entry::String)
+    @signpost_event log=log_compiler() "Link" entry
 
     @signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
-        dev = device()
-        lib = MTLLibraryFromData(dev, compiled.metallib)
-        fun = MTLFunction(lib, compiled.entry)
-        pipeline_state = try
-            MTLComputePipelineState(dev, fun)
+        lib = MTLLibraryFromData(dev, metallib)  # build the library on `dev` from the raw bytes
+        fun = MTLFunction(lib, entry)  # look up the function named `entry` in that library
+        try  # NOTE(review): the `return` below exits from inside `@signpost_interval` — confirm the macro still ends the interval
+            return MTLComputePipelineState(dev, fun)
         catch err
             isa(err, NSError) || rethrow()
             retain(err)
 
             # the back-end compiler likely failed
-            # XXX: check more accurately? the error domain doesn't help much here
-            ir_file = tempname(cleanup=false) * ".ll"
-            write(ir_file, compiled.ir)
-            air_file = tempname(cleanup=false) * ".air"
-            write(air_file, compiled.air)
             metallib_file = tempname(cleanup=false) * ".metallib"
-            write(metallib_file, compiled.metallib)
+            write(metallib_file, metallib)  # keep the failing library so it can be attached to a report
             if parse(Bool, get(ENV, "BUILDKITE", "false"))
-                run(`buildkite-agent artifact upload $(ir_file)`)
-                run(`buildkite-agent artifact upload $(air_file)`)
                 run(`buildkite-agent artifact upload $(metallib_file)`)
             end
             error("""Compilation to native code failed; see below for details.
-                     If you think this is a bug, please file an issue and attach the following files:
-                     - $(ir_file)
-                     - $(air_file)
-                     - $(metallib_file)""")
+                     If you think this is a bug, please file an issue and attach $(metallib_file)""")
         end
     end
-
-    pipeline_state
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -180,27 +180,52 @@ in a hot path without degrading performance. New code will be generated automati
 the function changes, or when different types or keyword arguments are provided.
 """
 function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
-    dev = device()
     Base.@lock mtlfunction_lock begin
-        # compile the function
-        cache = compiler_cache(dev)
-        source = methodinstance(F, tt)
+        dev = device()
         config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
-        pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+        source = methodinstance(F, tt)
+        job = CompilerJob(source, config)
+        cache = GPUCompiler.cache_view(job)
+
+        # `@something` (not the `something` function) so `compile_metal!` only runs
+        # on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs
+        # the full LLVM compile on every launch.
+        ci, res = @something lookup(cache, source) compile_metal!(cache, job)
+
+        # Resolve the MTLComputePipelineState for the active device. Linear scan
+        # over the session-local cache; almost always n=1, one `===` compare.
+        pipeline = nothing
+        @inbounds for (cached_dev, cached_pipeline) in res.pipelines
+            if cached_dev === dev
+                pipeline = cached_pipeline
+                break
+            end
+        end
+        if pipeline === nothing  # first use on this device: link now and remember the result
+            pipeline = link_pipeline(dev, res.metallib::Vector{UInt8},
+                                     res.entry::String)  # NOTE(review): these asserts throw if a cache hit has unpopulated results — confirm `lookup` cannot return one
+            push!(res.pipelines, (dev, pipeline))
+        end
 
-        # create a callable object that captures the function instance. we don't need to think
-        # about world age here, as GPUCompiler already does and will return a different object
         h = hash(pipeline, hash(f, hash(tt)))
-        kernel = get(_kernel_instances, h, nothing)
-        if kernel === nothing
-            # create the kernel state object
-            kernel = HostKernel{F,tt}(f, pipeline)
-            _kernel_instances[h] = kernel
-        end
-        return kernel::HostKernel{F,tt}
+        kernel = get(_kernel_instances, h, nothing)  # plain lookup: a `do` closure would capture the reassigned `pipeline` and box it
+        kernel === nothing && (kernel = _kernel_instances[h] = HostKernel{F,tt}(f, pipeline))
+        return kernel::HostKernel{F,tt}
     end
 end
 
+# Compile `job` to a metallib and store the session-portable artifacts on its cached
+# `MetalResults`. The `CodeInstance` is created by inference inside `GPUCompiler.compile`
+# (same owner-partitioned `CacheView`); `@setup_caching`'s `finish!` hook attaches the results.
+function compile_metal!(cache::CacheView, @nospecialize(job::CompilerJob))
+    lib_bytes, entry_name = compile_to_metallib(job)
+    ci = get(cache, job.source, nothing)::Core.CodeInstance
+    res = results(cache, ci)::MetalResults
+    res.metallib = lib_bytes
+    res.entry = entry_name
+    return (ci, res)
+end
+
 # cache of kernel instances
 const _kernel_instances = Dict{UInt, Any}()
 

diff --git a/src/precompile.jl b/src/precompile.jl
@@ -8,13 +8,7 @@ using PrecompileTools: @setup_workload, @compile_workload
     sprint(write, metallib)
 end
 
-precompile(compile, (CompilerJob,))
+precompile(compile_to_metallib, (CompilerJob,))
 precompile(Tuple{typeof(GPUCompiler.finish_ir!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function})
 precompile(Tuple{typeof(GPUCompiler.finish_module!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function})
 precompile(Tuple{typeof(GPUCompiler.check_ir), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module})
-precompile(Tuple{typeof(GPUCompiler.actual_compilation), Base.Dict{Any, Any}, Core.MethodInstance, UInt64, GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, typeof(Metal.compile), typeof(Metal.link)})
-
-# Worth the hassle
-if isdefined(Base, :Compiler) && isdefined(Base.Compiler, :typeinf_local)
-    precompile(Tuple{typeof(Base.Compiler.typeinf_local), GPUCompiler.GPUInterpreter{Base.Compiler.CachedMethodTable{Base.Compiler.OverlayMethodTable}}, Base.Compiler.InferenceState, Base.Compiler.CurrentState})
-end
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,7 +28,6 @@ steps: @@
             matrix:
               setup:
                 julia:
-                  - "1.10"
                   - "1.11"
                   - "1.12"
@@ Expand Down @@