From 8a37a8072a8af090e426d0e4442e8da86200ba0a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 10 May 2026 08:33:26 +0200 Subject: [PATCH 1/6] Switch to the new GPUCompiler caching API. Replace `cached_compilation` with a `MetalResults` struct attached to each `CodeInstance` via `CompilerCaching`: `metallib` + entry name are session-portable (cached through precompilation), and the `MTLComputePipelineState` is materialized lazily per session. Co-Authored-By: Claude Opus 4.7 (1M context) --- Project.toml | 4 +- src/Metal.jl | 1 + src/compiler/compilation.jl | 78 +++++++++++++++++-------------------- src/compiler/execution.jl | 39 ++++++++++++------- src/precompile.jl | 8 +--- 5 files changed, 66 insertions(+), 64 deletions(-) diff --git a/Project.toml b/Project.toml index 5656e7d38..c08ec526c 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" @@ -41,9 +42,10 @@ Adapt = "4.5" BFloat16s = "0.5, 0.6" CEnum = "0.4, 0.5" CodecBzip2 = "0.8.5" +CompilerCaching = "0.2.4" ExprTools = "0.1" GPUArrays = "11.5" -GPUCompiler = "1.7.1" +GPUCompiler = "1.10" GPUToolbox = "0.1, 0.2, 0.3, 1" KernelAbstractions = "0.9.38" LLVM = "7.2, 8, 9" diff --git a/src/Metal.jl b/src/Metal.jl index d9598bbe3..98c43747a 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -3,6 +3,7 @@ module Metal using GPUArrays using Adapt using GPUCompiler +using CompilerCaching: CacheView, lookup, results using GPUToolbox using LLVM using LLVM.Interop diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 5512059bb..71a284e17 100644 --- a/src/compiler/compilation.jl +++ 
b/src/compiler/compilation.jl @@ -4,12 +4,29 @@ struct MetalCompilerParams <: AbstractCompilerParams end const MetalCompilerConfig = CompilerConfig{MetalCompilerTarget, MetalCompilerParams} const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams} +""" + MetalResults + +Cached compilation results attached to each Metal `CodeInstance`. `metallib` and `entry` +are session-portable (serialized via package precompilation through CompilerCaching); +`pipeline` is a session-local handle that's re-linked from `metallib` after a fresh +session load. +""" +mutable struct MetalResults + metallib::Union{Nothing, Vector{UInt8}} + entry::Union{Nothing, String} + pipeline::Any # MTLComputePipelineState — populated lazily, not serialized + MetalResults() = new(nothing, nothing, nothing) +end + GPUCompiler.runtime_module(::MetalCompilerJob) = Metal GPUCompiler.method_table(::MetalCompilerJob) = method_table GPUCompiler.kernel_state_type(job::MetalCompilerJob) = KernelState +GPUCompiler.results_type(::MetalCompilerJob) = MetalResults + function GPUCompiler.finish_module!(@nospecialize(job::MetalCompilerJob), mod::LLVM.Module, entry::LLVM.Function) entry = invoke(GPUCompiler.finish_module!, @@ -115,18 +132,7 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob), end -## compiler implementation (cache, configure, compile, and link) - -# cache of compilation caches, per device -const _compiler_caches = Dict{MTLDevice, Dict{Any, Any}}() -function compiler_cache(ctx::MTLDevice) - cache = get(_compiler_caches, ctx, nothing) - if cache === nothing - cache = Dict{Any, Any}() - _compiler_caches[ctx] = cache - end - return cache -end +## compiler implementation (configure, compile, and link) # cache of compiler configurations, per device (but additionally configurable via kwargs) const _toolchain = Ref{Any}() @@ -163,12 +169,13 @@ end CompilerConfig(target, params; kernel, name, always_inline) end -# compile to executable machine code -function 
compile(@nospecialize(job::CompilerJob)) +# run inference + LLVM codegen, downgrade to AIR, wrap in a Metal library. +# returns `(metallib::Vector{UInt8}, entry::String)`, both session-portable so they +# survive precompilation when stored on a cached `CodeInstance`. +function compile_to_metallib(@nospecialize(job::CompilerJob)) @signpost_event log=log_compiler() "Compile" "Job=$job" @signpost_interval log=log_compiler() "Generate LLVM IR" begin - # TODO: on 1.9, this actually creates a context. cache those. ir, entry = JuliaContext() do ctx mod, meta = GPUCompiler.compile(:llvm, job) string(mod), LLVM.name(meta.entry) @@ -176,23 +183,22 @@ function compile(@nospecialize(job::CompilerJob)) end @signpost_interval log=log_compiler() "Downgrade to AIR" begin - # generate AIR air = let input = Pipe() output = Pipe() - log = Pipe() + errlog = Pipe() cmd = `$(LLVMDowngrader_jll.llvm_as()) --bitcode-version=5.0 -o -` - proc = run(pipeline(cmd, stdout=output, stderr=log, stdin=input); wait=false) + proc = run(pipeline(cmd, stdout=output, stderr=errlog, stdin=input); wait=false) close(output.in) - close(log.in) + close(errlog.in) writer = @async begin write(input, ir) close(input) end reader = @async read(output) - logger = @async read(log, String) + logger = @async read(errlog, String) try wait(proc) @@ -237,43 +243,31 @@ function compile(@nospecialize(job::CompilerJob)) end end - return (; ir, air, metallib, entry) + return (; metallib, entry) end -# link into an executable kernel -@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled) - @signpost_event log=log_compiler() "Link" "Job=$job" +# link the metallib into a session-local pipeline state on the active device. 
+@autoreleasepool function link_pipeline(metallib::Vector{UInt8}, entry::String) + @signpost_event log=log_compiler() "Link" entry @signpost_interval log=log_compiler() "Instantiate compute pipeline" begin dev = device() - lib = MTLLibraryFromData(dev, compiled.metallib) - fun = MTLFunction(lib, compiled.entry) - pipeline_state = try - MTLComputePipelineState(dev, fun) + lib = MTLLibraryFromData(dev, metallib) + fun = MTLFunction(lib, entry) + try + return MTLComputePipelineState(dev, fun) catch err isa(err, NSError) || rethrow() retain(err) # the back-end compiler likely failed - # XXX: check more accurately? the error domain doesn't help much here - ir_file = tempname(cleanup=false) * ".ll" - write(ir_file, compiled.ir) - air_file = tempname(cleanup=false) * ".air" - write(air_file, compiled.air) metallib_file = tempname(cleanup=false) * ".metallib" - write(metallib_file, compiled.metallib) + write(metallib_file, metallib) if parse(Bool, get(ENV, "BUILDKITE", "false")) - run(`buildkite-agent artifact upload $(ir_file)`) - run(`buildkite-agent artifact upload $(air_file)`) run(`buildkite-agent artifact upload $(metallib_file)`) end error("""Compilation to native code failed; see below for details. - If you think this is a bug, please file an issue and attach the following files: - - $(ir_file) - - $(air_file) - - $(metallib_file)""") + If you think this is a bug, please file an issue and attach $(metallib_file)""") end end - - pipeline_state end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 239ebe0bd..64a75829c 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -180,27 +180,38 @@ in a hot path without degrading performance. New code will be generated automati the function changes, or when different types or keyword arguments are provided. """ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) 
where {F,TT} - dev = device() Base.@lock mtlfunction_lock begin - # compile the function - cache = compiler_cache(dev) + config = compiler_config(device(); name, kwargs...)::MetalCompilerConfig source = methodinstance(F, tt) - config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig - pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link) + job = CompilerJob(source, config) + cache = GPUCompiler.cache_view(job) - # create a callable object that captures the function instance. we don't need to think - # about world age here, as GPUCompiler already does and will return a different object - h = hash(pipeline, hash(f, hash(tt))) - kernel = get(_kernel_instances, h, nothing) - if kernel === nothing - # create the kernel state object - kernel = HostKernel{F,tt}(f, pipeline) - _kernel_instances[h] = kernel + ci, res = something(lookup(cache, source), compile_metal!(cache, job)) + if res.pipeline === nothing + res.pipeline = link_pipeline(res.metallib::Vector{UInt8}, + res.entry::String) end - return kernel::HostKernel{F,tt} + pipeline = res.pipeline::MTLComputePipelineState + + h = hash(pipeline, hash(f, hash(tt))) + get!(_kernel_instances, h) do + HostKernel{F,tt}(f, pipeline) + end::HostKernel{F,tt} end end +# Run inference and codegen for `job`, then populate the cached `MetalResults` with the +# session-portable artifacts. The `CodeInstance` is created during inference inside +# `GPUCompiler.compile` (which uses the same owner-partitioned `CacheView`), and gets a +# fresh `MetalResults()` attached via `@setup_caching`'s `finish!` hook. 
+function compile_metal!(cache::CacheView, @nospecialize(job::CompilerJob)) + metallib, entry = compile_to_metallib(job) + ci = get(cache, job.source, nothing)::Core.CodeInstance + res = results(cache, ci)::MetalResults + res.metallib, res.entry = metallib, entry + return (ci, res) +end + # cache of kernel instances const _kernel_instances = Dict{UInt, Any}() diff --git a/src/precompile.jl b/src/precompile.jl index 296c3a4a1..138568667 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -8,13 +8,7 @@ using PrecompileTools: @setup_workload, @compile_workload sprint(write, metallib) end -precompile(compile, (CompilerJob,)) +precompile(compile_to_metallib, (CompilerJob,)) precompile(Tuple{typeof(GPUCompiler.finish_ir!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) precompile(Tuple{typeof(GPUCompiler.finish_module!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) precompile(Tuple{typeof(GPUCompiler.check_ir), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module}) -precompile(Tuple{typeof(GPUCompiler.actual_compilation), Base.Dict{Any, Any}, Core.MethodInstance, UInt64, GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, typeof(Metal.compile), typeof(Metal.link)}) - -# Worth the hassle -if isdefined(Base, :Compiler) && isdefined(Base.Compiler, :typeinf_local) - precompile(Tuple{typeof(Base.Compiler.typeinf_local), GPUCompiler.GPUInterpreter{Base.Compiler.CachedMethodTable{Base.Compiler.OverlayMethodTable}}, Base.Compiler.InferenceState, Base.Compiler.CurrentState}) -end From 1dc2dcf89272f3a14c99f49852c7f3c6e9779daa Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 10 May 2026 12:31:14 +0200 Subject: [PATCH 2/6] Opt in to GPUCompiler runtime bitcode caching. Adds a `bitcode` field to `MetalResults` and overrides `GPUCompiler.bitcode` / `bitcode!`. 
Per-function runtime library bitcode now rides on the same precompilation path as `metallib`/`entry`, so cross-session loads can skip the runtime rebuild. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/compiler/compilation.jl | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 71a284e17..77c279287 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -7,16 +7,30 @@ const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams} """ MetalResults -Cached compilation results attached to each Metal `CodeInstance`. `metallib` and `entry` -are session-portable (serialized via package precompilation through CompilerCaching); -`pipeline` is a session-local handle that's re-linked from `metallib` after a fresh -session load. +Cached compilation results attached to each Metal `CodeInstance`. Fields are populated +through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which +GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`), +`metallib` + `entry` after AIR downgrade + library wrap, `pipeline` after the +session-local link onto an `MTLDevice`. The first three are session-portable (cached +through precompilation); `pipeline` is session-local. """ mutable struct MetalResults + bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}} # (opaque_pointers, bytes) metallib::Union{Nothing, Vector{UInt8}} entry::Union{Nothing, String} pipeline::Any # MTLComputePipelineState — populated lazily, not serialized - MetalResults() = new(nothing, nothing, nothing) + MetalResults() = new(nothing, nothing, nothing, nothing) +end + +function GPUCompiler.bitcode(r::MetalResults, opaque_pointers::Bool) + r.bitcode === nothing && return nothing + stored, bytes = r.bitcode + return stored === opaque_pointers ? 
bytes : nothing +end + +function GPUCompiler.bitcode!(r::MetalResults, bytes::Vector{UInt8}, opaque_pointers::Bool) + r.bitcode = (opaque_pointers, bytes) + return nothing end GPUCompiler.runtime_module(::MetalCompilerJob) = Metal From 96574be8ee3132a2ac295637d07ae3e44ced8aac Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 10 May 2026 20:30:03 +0200 Subject: [PATCH 3/6] Cache linked pipelines per-device. Replace the single `pipeline` slot on `MetalResults` with a small linear cache of `(MTLDevice, MTLComputePipelineState)` pairs. The cache partition already covers the macOS / AIR / Metal versions that affect codegen, but two `MTLDevice`s on a single Mac (e.g. integrated + discrete) share the same `metallib` and need separate `MTLComputePipelineState`s. Hot-path cost is unchanged: one field load + one `===` compare. The common case (single device) stays at n=1. `link_pipeline` now takes the target `MTLDevice` explicitly instead of calling `device()` internally, so the call site captures the device once under `mtlfunction_lock`. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/compiler/compilation.jl | 21 ++++++++++++++------- src/compiler/execution.jl | 21 ++++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 77c279287..6d2b4b1d5 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -10,16 +10,24 @@ const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams} Cached compilation results attached to each Metal `CodeInstance`. Fields are populated through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`), -`metallib` + `entry` after AIR downgrade + library wrap, `pipeline` after the +`metallib` + `entry` after AIR downgrade + library wrap, and `pipelines` after the session-local link onto an `MTLDevice`. 
The first three are session-portable (cached -through precompilation); `pipeline` is session-local. +through precompilation); `pipelines` is session-local. + +`pipelines` is a small linear cache of `(MTLDevice, MTLComputePipelineState)` pairs. +The cache partition (via `GPUCompiler.cache_owner`) already covers the macOS / AIR / +Metal versions that affect codegen, so the only runtime-visible dimension left is the +`MTLDevice` that owns the linked pipeline state. A linear scan with `===` is fastest in +the common case (n=1, single device per process) and remains cheap when multiple GPUs +are addressed (e.g. integrated + discrete on a Mac). """ mutable struct MetalResults bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}} # (opaque_pointers, bytes) metallib::Union{Nothing, Vector{UInt8}} entry::Union{Nothing, String} - pipeline::Any # MTLComputePipelineState — populated lazily, not serialized - MetalResults() = new(nothing, nothing, nothing, nothing) + pipelines::Vector{Tuple{MTLDevice, MTLComputePipelineState}} # session-local + MetalResults() = new(nothing, nothing, nothing, + Tuple{MTLDevice, MTLComputePipelineState}[]) end function GPUCompiler.bitcode(r::MetalResults, opaque_pointers::Bool) @@ -260,12 +268,11 @@ function compile_to_metallib(@nospecialize(job::CompilerJob)) return (; metallib, entry) end -# link the metallib into a session-local pipeline state on the active device. -@autoreleasepool function link_pipeline(metallib::Vector{UInt8}, entry::String) +# link the metallib into a session-local pipeline state on the given device. 
+@autoreleasepool function link_pipeline(dev::MTLDevice, metallib::Vector{UInt8}, entry::String) @signpost_event log=log_compiler() "Link" entry @signpost_interval log=log_compiler() "Instantiate compute pipeline" begin - dev = device() lib = MTLLibraryFromData(dev, metallib) fun = MTLFunction(lib, entry) try diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 64a75829c..211d59e5a 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -181,17 +181,28 @@ the function changes, or when different types or keyword arguments are provided. """ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Base.@lock mtlfunction_lock begin - config = compiler_config(device(); name, kwargs...)::MetalCompilerConfig + dev = device() + config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig source = methodinstance(F, tt) job = CompilerJob(source, config) cache = GPUCompiler.cache_view(job) ci, res = something(lookup(cache, source), compile_metal!(cache, job)) - if res.pipeline === nothing - res.pipeline = link_pipeline(res.metallib::Vector{UInt8}, - res.entry::String) + + # Resolve the MTLComputePipelineState for the active device. Linear scan + # over the session-local cache; almost always n=1, one `===` compare. + pipeline = nothing + @inbounds for (cached_dev, cached_pipeline) in res.pipelines + if cached_dev === dev + pipeline = cached_pipeline + break + end + end + if pipeline === nothing + pipeline = link_pipeline(dev, res.metallib::Vector{UInt8}, + res.entry::String) + push!(res.pipelines, (dev, pipeline)) end - pipeline = res.pipeline::MTLComputePipelineState h = hash(pipeline, hash(f, hash(tt))) get!(_kernel_instances, h) do From 9edfea00c44464fd8ccedbcde4c491c78d1ecf72 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 10 May 2026 20:42:52 +0200 Subject: [PATCH 4/6] Avoid re-running compile on every cached `mtlfunction` call. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `something(lookup(...), compile_metal!(...))` evaluated `compile_metal!` eagerly even on a cache hit, so every kernel launch silently re-ran the full LLVM compile pipeline. Branch explicitly on the lookup result. Warm-cache `mtlfunction` cost: ~3.4 ms → ~380 ns. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/compiler/execution.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 211d59e5a..70688de82 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -187,7 +187,10 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} job = CompilerJob(source, config) cache = GPUCompiler.cache_view(job) - ci, res = something(lookup(cache, source), compile_metal!(cache, job)) + # `@something` (not the `something` function) so `compile_metal!` only runs + # on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs + # the full LLVM compile on every launch. + ci, res = @something lookup(cache, source) compile_metal!(cache, job) # Resolve the MTLComputePipelineState for the active device. Linear scan # over the session-local cache; almost always n=1, one `===` compare. From 0ac061dca96eef7fa532aaed1742b5697ace30d2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 12 May 2026 11:40:30 +0200 Subject: [PATCH 5/6] Use development version of GPUCompiler.jl. 
--- Project.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Project.toml b/Project.toml index c08ec526c..fbeac9b0c 100644 --- a/Project.toml +++ b/Project.toml @@ -64,3 +64,6 @@ SpecialFunctions = "2" StaticArrays = "1" UUIDs = "1" julia = "1.10" + +[sources] +GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"} From 00d69c1c094e4bd8fe6f3553910cf5b136d4d31c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 12 May 2026 11:41:22 +0200 Subject: [PATCH 6/6] Drop Julia 1.10 from CI and raise the compat bound to 1.11. --- .buildkite/pipeline.yml | 1 - Project.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 64413936d..2d0cd3b0b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -28,7 +28,6 @@ steps: matrix: setup: julia: - - "1.10" - "1.11" - "1.12" diff --git a/Project.toml b/Project.toml index fbeac9b0c..362e9bc05 100644 --- a/Project.toml +++ b/Project.toml @@ -63,7 +63,7 @@ ScopedValues = "1.3.0" SpecialFunctions = "2" StaticArrays = "1" UUIDs = "1" -julia = "1.10" +julia = "1.11" [sources] GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"}