diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 64413936d..2d0cd3b0b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -28,7 +28,6 @@ steps: matrix: setup: julia: - - "1.10" - "1.11" - "1.12" diff --git a/Project.toml b/Project.toml index 5656e7d38..362e9bc05 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" @@ -41,9 +42,10 @@ Adapt = "4.5" BFloat16s = "0.5, 0.6" CEnum = "0.4, 0.5" CodecBzip2 = "0.8.5" +CompilerCaching = "0.2.4" ExprTools = "0.1" GPUArrays = "11.5" -GPUCompiler = "1.7.1" +GPUCompiler = "1.10" GPUToolbox = "0.1, 0.2, 0.3, 1" KernelAbstractions = "0.9.38" LLVM = "7.2, 8, 9" @@ -61,4 +63,7 @@ ScopedValues = "1.3.0" SpecialFunctions = "2" StaticArrays = "1" UUIDs = "1" -julia = "1.10" +julia = "1.11" + +[sources] +GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"} diff --git a/src/Metal.jl b/src/Metal.jl index d9598bbe3..98c43747a 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -3,6 +3,7 @@ module Metal using GPUArrays using Adapt using GPUCompiler +using CompilerCaching: CacheView, lookup, results using GPUToolbox using LLVM using LLVM.Interop diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 5512059bb..6d2b4b1d5 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -4,12 +4,51 @@ struct MetalCompilerParams <: AbstractCompilerParams end const MetalCompilerConfig = CompilerConfig{MetalCompilerTarget, MetalCompilerParams} const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams} +""" + MetalResults + +Cached compilation results attached to each Metal `CodeInstance`. Fields are populated +through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which +GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`), +`metallib` + `entry` after AIR downgrade + library wrap, and `pipelines` after the +session-local link onto an `MTLDevice`. The first three are session-portable (cached +through precompilation); `pipelines` is session-local. + +`pipelines` is a small linear cache of `(MTLDevice, MTLComputePipelineState)` pairs. +The cache partition (via `GPUCompiler.cache_owner`) already covers the macOS / AIR / +Metal versions that affect codegen, so the only runtime-visible dimension left is the +`MTLDevice` that owns the linked pipeline state. A linear scan with `===` is fastest in +the common case (n=1, single device per process) and remains cheap when multiple GPUs +are addressed (e.g. integrated + discrete on a Mac). +""" +mutable struct MetalResults + bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}} # (opaque_pointers, bytes) + metallib::Union{Nothing, Vector{UInt8}} + entry::Union{Nothing, String} + pipelines::Vector{Tuple{MTLDevice, MTLComputePipelineState}} # session-local + MetalResults() = new(nothing, nothing, nothing, + Tuple{MTLDevice, MTLComputePipelineState}[]) +end + +function GPUCompiler.bitcode(r::MetalResults, opaque_pointers::Bool) + r.bitcode === nothing && return nothing + stored, bytes = r.bitcode + return stored === opaque_pointers ? bytes : nothing +end + +function GPUCompiler.bitcode!(r::MetalResults, bytes::Vector{UInt8}, opaque_pointers::Bool) + r.bitcode = (opaque_pointers, bytes) + return nothing +end + GPUCompiler.runtime_module(::MetalCompilerJob) = Metal GPUCompiler.method_table(::MetalCompilerJob) = method_table GPUCompiler.kernel_state_type(job::MetalCompilerJob) = KernelState +GPUCompiler.results_type(::MetalCompilerJob) = MetalResults + function GPUCompiler.finish_module!(@nospecialize(job::MetalCompilerJob), mod::LLVM.Module, entry::LLVM.Function) entry = invoke(GPUCompiler.finish_module!, @@ -115,18 +154,7 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob), end -## compiler implementation (cache, configure, compile, and link) - -# cache of compilation caches, per device -const _compiler_caches = Dict{MTLDevice, Dict{Any, Any}}() -function compiler_cache(ctx::MTLDevice) - cache = get(_compiler_caches, ctx, nothing) - if cache === nothing - cache = Dict{Any, Any}() - _compiler_caches[ctx] = cache - end - return cache -end +## compiler implementation (configure, compile, and link) # cache of compiler configurations, per device (but additionally configurable via kwargs) const _toolchain = Ref{Any}() @@ -163,12 +191,13 @@ end CompilerConfig(target, params; kernel, name, always_inline) end -# compile to executable machine code -function compile(@nospecialize(job::CompilerJob)) +# run inference + LLVM codegen, downgrade to AIR, wrap in a Metal library. +# returns `(metallib::Vector{UInt8}, entry::String)`, both session-portable so they +# survive precompilation when stored on a cached `CodeInstance`. +function compile_to_metallib(@nospecialize(job::CompilerJob)) @signpost_event log=log_compiler() "Compile" "Job=$job" @signpost_interval log=log_compiler() "Generate LLVM IR" begin - # TODO: on 1.9, this actually creates a context. cache those. ir, entry = JuliaContext() do ctx mod, meta = GPUCompiler.compile(:llvm, job) string(mod), LLVM.name(meta.entry) @@ -176,23 +205,22 @@ function compile(@nospecialize(job::CompilerJob)) end @signpost_interval log=log_compiler() "Downgrade to AIR" begin - # generate AIR air = let input = Pipe() output = Pipe() - log = Pipe() + errlog = Pipe() cmd = `$(LLVMDowngrader_jll.llvm_as()) --bitcode-version=5.0 -o -` - proc = run(pipeline(cmd, stdout=output, stderr=log, stdin=input); wait=false) + proc = run(pipeline(cmd, stdout=output, stderr=errlog, stdin=input); wait=false) close(output.in) - close(log.in) + close(errlog.in) writer = @async begin write(input, ir) close(input) end reader = @async read(output) - logger = @async read(log, String) + logger = @async read(errlog, String) try wait(proc) @@ -237,43 +265,30 @@ function compile(@nospecialize(job::CompilerJob)) end end - return (; ir, air, metallib, entry) + return (; metallib, entry) end -# link into an executable kernel -@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled) - @signpost_event log=log_compiler() "Link" "Job=$job" +# link the metallib into a session-local pipeline state on the given device. +@autoreleasepool function link_pipeline(dev::MTLDevice, metallib::Vector{UInt8}, entry::String) + @signpost_event log=log_compiler() "Link" entry @signpost_interval log=log_compiler() "Instantiate compute pipeline" begin - dev = device() - lib = MTLLibraryFromData(dev, compiled.metallib) - fun = MTLFunction(lib, compiled.entry) - pipeline_state = try - MTLComputePipelineState(dev, fun) + lib = MTLLibraryFromData(dev, metallib) + fun = MTLFunction(lib, entry) + try + return MTLComputePipelineState(dev, fun) catch err isa(err, NSError) || rethrow() retain(err) # the back-end compiler likely failed - # XXX: check more accurately? the error domain doesn't help much here - ir_file = tempname(cleanup=false) * ".ll" - write(ir_file, compiled.ir) - air_file = tempname(cleanup=false) * ".air" - write(air_file, compiled.air) metallib_file = tempname(cleanup=false) * ".metallib" - write(metallib_file, compiled.metallib) + write(metallib_file, metallib) if parse(Bool, get(ENV, "BUILDKITE", "false")) - run(`buildkite-agent artifact upload $(ir_file)`) - run(`buildkite-agent artifact upload $(air_file)`) run(`buildkite-agent artifact upload $(metallib_file)`) end error("""Compilation to native code failed; see below for details. - If you think this is a bug, please file an issue and attach the following files: - - $(ir_file) - - $(air_file) - - $(metallib_file)""") + If you think this is a bug, please file an issue and attach $(metallib_file)""") end end - - pipeline_state end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 239ebe0bd..70688de82 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -180,27 +180,52 @@ in a hot path without degrading performance. New code will be generated automati the function changes, or when different types or keyword arguments are provided. """ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} - dev = device() Base.@lock mtlfunction_lock begin - # compile the function - cache = compiler_cache(dev) - source = methodinstance(F, tt) + dev = device() config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig - pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link) + source = methodinstance(F, tt) + job = CompilerJob(source, config) + cache = GPUCompiler.cache_view(job) + + # `@something` (not the `something` function) so `compile_metal!` only runs + # on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs + # the full LLVM compile on every launch. + ci, res = @something lookup(cache, source) compile_metal!(cache, job) + + # Resolve the MTLComputePipelineState for the active device. Linear scan + # over the session-local cache; almost always n=1, one `===` compare. + pipeline = nothing + @inbounds for (cached_dev, cached_pipeline) in res.pipelines + if cached_dev === dev + pipeline = cached_pipeline + break + end + end + if pipeline === nothing + pipeline = link_pipeline(dev, res.metallib::Vector{UInt8}, + res.entry::String) + push!(res.pipelines, (dev, pipeline)) + end - # create a callable object that captures the function instance. we don't need to think - # about world age here, as GPUCompiler already does and will return a different object h = hash(pipeline, hash(f, hash(tt))) - kernel = get(_kernel_instances, h, nothing) - if kernel === nothing - # create the kernel state object - kernel = HostKernel{F,tt}(f, pipeline) - _kernel_instances[h] = kernel - end - return kernel::HostKernel{F,tt} + get!(_kernel_instances, h) do + HostKernel{F,tt}(f, pipeline) + end::HostKernel{F,tt} end end +# Run inference and codegen for `job`, then populate the cached `MetalResults` with the +# session-portable artifacts. The `CodeInstance` is created during inference inside +# `GPUCompiler.compile` (which uses the same owner-partitioned `CacheView`), and gets a +# fresh `MetalResults()` attached via `@setup_caching`'s `finish!` hook. +function compile_metal!(cache::CacheView, @nospecialize(job::CompilerJob)) + metallib, entry = compile_to_metallib(job) + ci = get(cache, job.source, nothing)::Core.CodeInstance + res = results(cache, ci)::MetalResults + res.metallib, res.entry = metallib, entry + return (ci, res) +end + # cache of kernel instances const _kernel_instances = Dict{UInt, Any}() diff --git a/src/precompile.jl b/src/precompile.jl index 296c3a4a1..138568667 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -8,13 +8,7 @@ using PrecompileTools: @setup_workload, @compile_workload sprint(write, metallib) end -precompile(compile, (CompilerJob,)) +precompile(compile_to_metallib, (CompilerJob,)) precompile(Tuple{typeof(GPUCompiler.finish_ir!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) precompile(Tuple{typeof(GPUCompiler.finish_module!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) precompile(Tuple{typeof(GPUCompiler.check_ir), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module}) -precompile(Tuple{typeof(GPUCompiler.actual_compilation), Base.Dict{Any, Any}, Core.MethodInstance, UInt64, GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, typeof(Metal.compile), typeof(Metal.link)}) - -# Worth the hassle -if isdefined(Base, :Compiler) && isdefined(Base.Compiler, :typeinf_local) - precompile(Tuple{typeof(Base.Compiler.typeinf_local), GPUCompiler.GPUInterpreter{Base.Compiler.CachedMethodTable{Base.Compiler.OverlayMethodTable}}, Base.Compiler.InferenceState, Base.Compiler.CurrentState}) -end