diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 36361b4a..a939f808 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -53,5 +53,5 @@ steps: matrix: setup: julia: - - "1.10" + - "1.11" - "1.12" diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index fcafe9c1..81d88a27 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -22,7 +22,7 @@ jobs: strategy: fail-fast: false matrix: - version: ['1.10', '1.12'] + version: ['1.11', '1.12'] os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15-intel, windows-2022] arch: [x64, arm64] pocl: [jll, local] diff --git a/Project.toml b/Project.toml index 828e36be..bd6c7afe 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version = "0.10.9" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" @@ -26,10 +27,11 @@ SPIRVIntrinsics = {path = "lib/intrinsics"} [compat] Adapt = "4" +CompilerCaching = "0.2.4" GPUArrays = "11.2.1" -GPUCompiler = "1.7.1" +GPUCompiler = "1.10" KernelAbstractions = "0.9.38" -LLVM = "9.1" +LLVM = "9.6" LinearAlgebra = "1" OpenCL_jll = "=2024.10.24" Preferences = "1" @@ -42,4 +44,7 @@ SPIRVIntrinsics = "0.5.7" SPIRV_LLVM_Backend_jll = "20" SPIRV_Tools_jll = "2025.1" StaticArrays = "1" -julia = "1.10" +julia = "1.11" + +[sources] +GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"} diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 7b2a52d9..6f58caf0 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -1,6 +1,7 @@ module OpenCL using GPUCompiler +using CompilerCaching: CacheView, lookup, results using LLVM, LLVM.Interop using SPIRV_LLVM_Backend_jll, SPIRV_Tools_jll using Adapt diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 5f9ae484..89eac9af 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -7,8 +7,46 @@ end const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams} +""" + OpenCLResults + +Cached compilation results attached to each OpenCL `CodeInstance`. Fields are populated +through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which +GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`), +`obj` (SPIR-V bytes) + `entry` + `device_rng` after main codegen, and `kernels` after +the session-local link onto an OpenCL context. The first four are session-portable +(cached through precompilation); `kernels` is session-local. + +`kernels` is a small linear cache of `(cl.Context, cl.Kernel)` pairs. The cache partition +already covers everything that affects codegen via `GPUCompiler.cache_owner`, so the only +runtime-visible dimension left is the OpenCL context that owns the linked `cl.Kernel`. +A linear scan with `===` is fastest in the common case (n=1) and stays cheap for the +rare workload that bounces between a handful of contexts on the same device. +""" +mutable struct OpenCLResults + bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}} # (opaque_pointers, bytes) + obj::Union{Nothing, Vector{UInt8}} # SPIR-V binary + entry::Union{Nothing, String} + device_rng::Bool + kernels::Vector{Tuple{cl.Context, cl.Kernel}} # session-local; linear-scanned + OpenCLResults() = new(nothing, nothing, nothing, false, Tuple{cl.Context, cl.Kernel}[]) +end + +function GPUCompiler.bitcode(r::OpenCLResults, opaque_pointers::Bool) + r.bitcode === nothing && return nothing + stored, bytes = r.bitcode + return stored === opaque_pointers ? bytes : nothing +end + +function GPUCompiler.bitcode!(r::OpenCLResults, bytes::Vector{UInt8}, opaque_pointers::Bool) + r.bitcode = (opaque_pointers, bytes) + return nothing +end + GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams}) = OpenCL +GPUCompiler.results_type(::OpenCLCompilerJob) = OpenCLResults + GPUCompiler.method_table_view(job::OpenCLCompilerJob) = GPUCompiler.StackedMethodTable(job.world, method_table, SPIRVIntrinsics.method_table) @@ -111,18 +149,7 @@ function GPUCompiler.finish_linked_module!(@nospecialize(job::OpenCLCompilerJob) return end -## compiler implementation (cache, configure, compile, and link) - -# cache of compilation caches, per context -const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}() -function compiler_cache(ctx::cl.Context) - cache = get(_compiler_caches, ctx, nothing) - if cache === nothing - cache = Dict{Any, Any}() - _compiler_caches[ctx] = cache - end - return cache -end +## compiler implementation (configure, compile, and link) # cache of compiler configurations, per device (but additionally configurable via kwargs) const _toolchain = Ref{Any}() @@ -153,37 +180,35 @@ end CompilerConfig(target, params; kernel, name, always_inline) end -# compile to executable machine code +# run inference + LLVM codegen + SPIR-V emission. returns `(obj, entry, device_rng)`, +# all session-portable so they survive precompilation when stored on a cached `CodeInstance`. const compilations = Threads.Atomic{Int}(0) -function compile(@nospecialize(job::CompilerJob)) +function compile_to_obj(@nospecialize(job::CompilerJob)) compilations[] += 1 - # TODO: this creates a context; cache those. - obj, meta = JuliaContext() do ctx + JuliaContext() do ctx obj, meta = GPUCompiler.compile(:obj, job) - entry = LLVM.name(meta.entry) device_rng = StringAttribute("julia.opencl.rng", "") in collect(function_attributes(meta.entry)) - (; obj, entry, device_rng) end end -# link into an executable kernel -function link(@nospecialize(job::CompilerJob), compiled) +# link the SPIR-V bytes into a session-local `cl.Kernel` on the active context. +function link_kernel(obj::Vector{UInt8}, entry::String) prog = if "cl_khr_il_program" in cl.device().extensions - cl.Program(; il=compiled.obj) + cl.Program(; il=obj) else error("Your device does not support SPIR-V, which is currently required for native execution.") # XXX: kpet/spirv2clc#87, caused by KhronosGroup/SPIRV-LLVM-Translator#2029 source = mktempdir() do dir il = joinpath(dir, "kernel.spv") - write(il, compiled.obj) + write(il, obj) cmd = `spirv2clc $il` read(cmd, String) end cl.Program(; source) end cl.build!(prog) - (; kernel=cl.Kernel(prog, compiled.entry), compiled.device_rng) + return cl.Kernel(prog, entry) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 68885b8f..95e31fca 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -180,28 +180,52 @@ end const clfunction_lock = ReentrantLock() function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} - ctx = cl.context() - dev = cl.device() - Base.@lock clfunction_lock begin - # compile the function - cache = compiler_cache(ctx) + config = compiler_config(cl.device(); kwargs...)::OpenCLCompilerConfig source = methodinstance(F, tt) - config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig - linked = GPUCompiler.cached_compilation(cache, source, config, compile, link) - - # create a callable object that captures the function instance. we don't need to think - # about world age here, as GPUCompiler already does and will return a different object - h = hash(linked.kernel, hash(f, hash(tt))) - kernel = get(_kernel_instances, h, nothing) + job = CompilerJob(source, config) + cache = GPUCompiler.cache_view(job) + + # `@something` (not the `something` function) so `compile_opencl!` only runs + # on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs + # the full LLVM compile on every launch. + ci, res = @something lookup(cache, source) compile_opencl!(cache, job) + + # Resolve the cl.Kernel for the active context. Linear scan over the + # session-local cache; almost always n=1, so this is one `===` compare. + ctx = cl.context() + kernel = nothing + @inbounds for (cached_ctx, cached_kernel) in res.kernels + if cached_ctx === ctx + kernel = cached_kernel + break + end + end if kernel === nothing - # create the kernel state object - kernel = HostKernel{F,tt}(f, linked.kernel, linked.device_rng) - _kernel_instances[h] = kernel + kernel = link_kernel(res.obj::Vector{UInt8}, res.entry::String) + push!(res.kernels, (ctx, kernel)) end - return kernel::HostKernel{F,tt} + + h = hash(kernel, hash(f, hash(tt))) + get!(_kernel_instances, h) do + HostKernel{F,tt}(f, kernel, res.device_rng) + end::HostKernel{F,tt} end end +# Run inference and codegen for `job`, then populate the cached `OpenCLResults` with the +# session-portable artifacts. The `CodeInstance` is created during inference inside +# `GPUCompiler.compile` (which uses the same owner-partitioned `CacheView`), and gets a +# fresh `OpenCLResults()` attached via `@setup_caching`'s `finish!` hook. +function compile_opencl!(cache::CacheView, @nospecialize(job::CompilerJob)) + compiled = compile_to_obj(job) + ci = get(cache, job.source, nothing)::Core.CodeInstance + res = results(cache, ci)::OpenCLResults + res.obj = compiled.obj + res.entry = compiled.entry + res.device_rng = compiled.device_rng + return (ci, res) +end + # cache of kernel instances const _kernel_instances = Dict{UInt, Any}()