Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1.11"
- "1.12"

Expand Down
9 changes: 7 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
Expand Down Expand Up @@ -41,9 +42,10 @@ Adapt = "4.5"
BFloat16s = "0.5, 0.6"
CEnum = "0.4, 0.5"
CodecBzip2 = "0.8.5"
CompilerCaching = "0.2.4"
ExprTools = "0.1"
GPUArrays = "11.5"
GPUCompiler = "1.7.1"
GPUCompiler = "1.10"
GPUToolbox = "0.1, 0.2, 0.3, 1"
KernelAbstractions = "0.9.38"
LLVM = "7.2, 8, 9"
Expand All @@ -61,4 +63,7 @@ ScopedValues = "1.3.0"
SpecialFunctions = "2"
StaticArrays = "1"
UUIDs = "1"
julia = "1.10"
julia = "1.11"

[sources]
GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"}
1 change: 1 addition & 0 deletions src/Metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module Metal
using GPUArrays
using Adapt
using GPUCompiler
using CompilerCaching: CacheView, lookup, results
using GPUToolbox
using LLVM
using LLVM.Interop
Expand Down
101 changes: 58 additions & 43 deletions src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,51 @@ struct MetalCompilerParams <: AbstractCompilerParams end
const MetalCompilerConfig = CompilerConfig{MetalCompilerTarget, MetalCompilerParams}
const MetalCompilerJob = CompilerJob{MetalCompilerTarget, MetalCompilerParams}

"""
    MetalResults

Compilation results cached on each Metal `CodeInstance`. The fields are filled in as the
compile pipeline progresses:

- `bitcode`: set after LLVM codegen (for runtime functions, which GPUCompiler links into
  the kernel module — see `GPUCompiler.bitcode`/`bitcode!`);
- `metallib` and `entry`: set after the AIR downgrade and library wrap;
- `pipelines`: set after the session-local link onto an `MTLDevice`.

`bitcode`, `metallib` and `entry` are session-portable (cached through precompilation);
`pipelines` is session-local.

`pipelines` is a small linear cache of `(MTLDevice, MTLComputePipelineState)` pairs. The
cache partition (via `GPUCompiler.cache_owner`) already covers the macOS / AIR / Metal
versions that affect codegen, so the only runtime-visible dimension left is the
`MTLDevice` that owns the linked pipeline state. A linear scan with `===` is fastest in
the common case (n=1, single device per process) and remains cheap when multiple GPUs are
addressed (e.g. integrated + discrete on a Mac).
"""
mutable struct MetalResults
    # LLVM bitcode stored as `(opaque_pointers, bytes)`, or `nothing` when not yet set
    bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}}
    # Metal library bytes, or `nothing` when not yet set
    metallib::Union{Nothing, Vector{UInt8}}
    # name of the kernel entry point, or `nothing` when not yet set
    entry::Union{Nothing, String}
    # session-local `(device, pipeline state)` pairs
    pipelines::Vector{Tuple{MTLDevice, MTLComputePipelineState}}

    # start out with no artifacts and an empty pipeline list
    function MetalResults()
        linked = Tuple{MTLDevice, MTLComputePipelineState}[]
        return new(nothing, nothing, nothing, linked)
    end
end

# Return the stored bitcode bytes, but only when they were stored under the same
# `opaque_pointers` flag that is being requested; otherwise return `nothing`.
function GPUCompiler.bitcode(r::MetalResults, opaque_pointers::Bool)
    cached = r.bitcode
    if cached !== nothing
        mode, data = cached
        mode === opaque_pointers && return data
    end
    return nothing
end

# Store `bytes` on `r`, tagged with the `opaque_pointers` flag they were produced under.
# Whatever was stored before is overwritten.
function GPUCompiler.bitcode!(r::MetalResults, bytes::Vector{UInt8}, opaque_pointers::Bool)
    tagged = (opaque_pointers, bytes)
    r.bitcode = tagged
    return nothing
end

# GPUCompiler hooks for Metal jobs; each returns the same value for every
# `MetalCompilerJob`.

function GPUCompiler.runtime_module(::MetalCompilerJob)
    return Metal
end

function GPUCompiler.method_table(::MetalCompilerJob)
    return method_table
end

function GPUCompiler.kernel_state_type(::MetalCompilerJob)
    return KernelState
end

# type of the per-`CodeInstance` results object
function GPUCompiler.results_type(::MetalCompilerJob)
    return MetalResults
end

function GPUCompiler.finish_module!(@nospecialize(job::MetalCompilerJob),
mod::LLVM.Module, entry::LLVM.Function)
entry = invoke(GPUCompiler.finish_module!,
Expand Down Expand Up @@ -115,18 +154,7 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob),
end


## compiler implementation (cache, configure, compile, and link)

# one compilation cache per device, created lazily on first request
const _compiler_caches = Dict{MTLDevice, Dict{Any, Any}}()

# Return the compilation cache belonging to `ctx`, registering a fresh empty one the
# first time a device is seen.
function compiler_cache(ctx::MTLDevice)
    return get!(_compiler_caches, ctx) do
        Dict{Any, Any}()
    end
end
## compiler implementation (configure, compile, and link)

# cache of compiler configurations, per device (but additionally configurable via kwargs)
const _toolchain = Ref{Any}()
Expand Down Expand Up @@ -163,36 +191,36 @@ end
CompilerConfig(target, params; kernel, name, always_inline)
end

# compile to executable machine code
function compile(@nospecialize(job::CompilerJob))
# run inference + LLVM codegen, downgrade to AIR, wrap in a Metal library.
# returns a `NamedTuple` with fields `metallib::Vector{UInt8}` and `entry::String`, both
# session-portable so they survive precompilation when stored on a cached `CodeInstance`.
function compile_to_metallib(@nospecialize(job::CompilerJob))
@signpost_event log=log_compiler() "Compile" "Job=$job"

@signpost_interval log=log_compiler() "Generate LLVM IR" begin
# TODO: on 1.9, this actually creates a context. cache those.
ir, entry = JuliaContext() do ctx
mod, meta = GPUCompiler.compile(:llvm, job)
string(mod), LLVM.name(meta.entry)
end
end

@signpost_interval log=log_compiler() "Downgrade to AIR" begin
# generate AIR
air = let
input = Pipe()
output = Pipe()
log = Pipe()
errlog = Pipe()

cmd = `$(LLVMDowngrader_jll.llvm_as()) --bitcode-version=5.0 -o -`
proc = run(pipeline(cmd, stdout=output, stderr=log, stdin=input); wait=false)
proc = run(pipeline(cmd, stdout=output, stderr=errlog, stdin=input); wait=false)
close(output.in)
close(log.in)
close(errlog.in)

writer = @async begin
write(input, ir)
close(input)
end
reader = @async read(output)
logger = @async read(log, String)
logger = @async read(errlog, String)

try
wait(proc)
Expand Down Expand Up @@ -237,43 +265,30 @@ function compile(@nospecialize(job::CompilerJob))
end
end

return (; ir, air, metallib, entry)
return (; metallib, entry)
end

# link into an executable kernel
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
@signpost_event log=log_compiler() "Link" "Job=$job"
# link the metallib into a session-local pipeline state on the given device.
@autoreleasepool function link_pipeline(dev::MTLDevice, metallib::Vector{UInt8}, entry::String)
@signpost_event log=log_compiler() "Link" entry

@signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
dev = device()
lib = MTLLibraryFromData(dev, compiled.metallib)
fun = MTLFunction(lib, compiled.entry)
pipeline_state = try
MTLComputePipelineState(dev, fun)
lib = MTLLibraryFromData(dev, metallib)
fun = MTLFunction(lib, entry)
try
return MTLComputePipelineState(dev, fun)
catch err
isa(err, NSError) || rethrow()
retain(err)

# the back-end compiler likely failed
# XXX: check more accurately? the error domain doesn't help much here
ir_file = tempname(cleanup=false) * ".ll"
write(ir_file, compiled.ir)
air_file = tempname(cleanup=false) * ".air"
write(air_file, compiled.air)
metallib_file = tempname(cleanup=false) * ".metallib"
write(metallib_file, compiled.metallib)
write(metallib_file, metallib)
if parse(Bool, get(ENV, "BUILDKITE", "false"))
run(`buildkite-agent artifact upload $(ir_file)`)
run(`buildkite-agent artifact upload $(air_file)`)
run(`buildkite-agent artifact upload $(metallib_file)`)
end
error("""Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach the following files:
- $(ir_file)
- $(air_file)
- $(metallib_file)""")
If you think this is a bug, please file an issue and attach $(metallib_file)""")
end
end

pipeline_state
end
53 changes: 39 additions & 14 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,27 +180,52 @@ in a hot path without degrading performance. New code will be generated automati
the function changes, or when different types or keyword arguments are provided.
"""
function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
dev = device()
Base.@lock mtlfunction_lock begin
# compile the function
cache = compiler_cache(dev)
source = methodinstance(F, tt)
dev = device()
config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
source = methodinstance(F, tt)
job = CompilerJob(source, config)
cache = GPUCompiler.cache_view(job)

# `@something` (not the `something` function) so `compile_metal!` only runs
# on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs
# the full LLVM compile on every launch.
ci, res = @something lookup(cache, source) compile_metal!(cache, job)

# Resolve the MTLComputePipelineState for the active device. Linear scan
# over the session-local cache; almost always n=1, one `===` compare.
pipeline = nothing
@inbounds for (cached_dev, cached_pipeline) in res.pipelines
if cached_dev === dev
pipeline = cached_pipeline
break
end
end
if pipeline === nothing
pipeline = link_pipeline(dev, res.metallib::Vector{UInt8},
res.entry::String)
push!(res.pipelines, (dev, pipeline))
end

# create a callable object that captures the function instance. we don't need to think
# about world age here, as GPUCompiler already does and will return a different object
h = hash(pipeline, hash(f, hash(tt)))
kernel = get(_kernel_instances, h, nothing)
if kernel === nothing
# create the kernel state object
kernel = HostKernel{F,tt}(f, pipeline)
_kernel_instances[h] = kernel
end
return kernel::HostKernel{F,tt}
get!(_kernel_instances, h) do
HostKernel{F,tt}(f, pipeline)
end::HostKernel{F,tt}
end
end

# Cache-miss path: run inference and codegen for `job`, then store the session-portable
# artifacts (metallib bytes and entry-point name) on the cached `MetalResults`.
#
# The `CodeInstance` is created during inference inside `GPUCompiler.compile` (which uses
# the same owner-partitioned `CacheView`), and gets a fresh `MetalResults()` attached via
# `@setup_caching`'s `finish!` hook.
#
# Returns the `(CodeInstance, MetalResults)` pair.
function compile_metal!(cache::CacheView, @nospecialize(job::CompilerJob))
    lib_bytes, entry_name = compile_to_metallib(job)

    # the type assertion throws if no `CodeInstance` is found for the job's source
    code_instance = get(cache, job.source, nothing)::Core.CodeInstance
    cached = results(cache, code_instance)::MetalResults

    cached.metallib = lib_bytes
    cached.entry = entry_name
    return (code_instance, cached)
end

# cache of kernel instances
const _kernel_instances = Dict{UInt, Any}()

Expand Down
8 changes: 1 addition & 7 deletions src/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,7 @@ using PrecompileTools: @setup_workload, @compile_workload
sprint(write, metallib)
end

precompile(compile, (CompilerJob,))
precompile(compile_to_metallib, (CompilerJob,))
precompile(Tuple{typeof(GPUCompiler.finish_ir!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function})
precompile(Tuple{typeof(GPUCompiler.finish_module!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function})
precompile(Tuple{typeof(GPUCompiler.check_ir), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module})
precompile(Tuple{typeof(GPUCompiler.actual_compilation), Base.Dict{Any, Any}, Core.MethodInstance, UInt64, GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, typeof(Metal.compile), typeof(Metal.link)})

# Worth the hassle
if isdefined(Base, :Compiler) && isdefined(Base.Compiler, :typeinf_local)
precompile(Tuple{typeof(Base.Compiler.typeinf_local), GPUCompiler.GPUInterpreter{Base.Compiler.CachedMethodTable{Base.Compiler.OverlayMethodTable}}, Base.Compiler.InferenceState, Base.Compiler.CurrentState})
end