Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1.11"
- "1.12"
2 changes: 1 addition & 1 deletion .github/workflows/Test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: ['1.10', '1.12']
version: ['1.11', '1.12']
os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15-intel, windows-2022]
arch: [x64, arm64]
pocl: [jll, local]
Expand Down
11 changes: 8 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.10.9"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Expand All @@ -26,10 +27,11 @@ SPIRVIntrinsics = {path = "lib/intrinsics"}

[compat]
Adapt = "4"
CompilerCaching = "0.2.4"
GPUArrays = "11.2.1"
GPUCompiler = "1.7.1"
GPUCompiler = "1.10"
KernelAbstractions = "0.9.38"
LLVM = "9.1"
LLVM = "9.6"
LinearAlgebra = "1"
OpenCL_jll = "=2024.10.24"
Preferences = "1"
Expand All @@ -42,4 +44,7 @@ SPIRVIntrinsics = "0.5.7"
SPIRV_LLVM_Backend_jll = "20"
SPIRV_Tools_jll = "2025.1"
StaticArrays = "1"
julia = "1.10"
julia = "1.11"

[sources]
GPUCompiler = {url="https://github.com/JuliaGPU/GPUCompiler.jl", rev="tb/compilercaching"}
1 change: 1 addition & 0 deletions src/OpenCL.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module OpenCL

using GPUCompiler
using CompilerCaching: CacheView, lookup, results
using LLVM, LLVM.Interop
using SPIRV_LLVM_Backend_jll, SPIRV_Tools_jll
using Adapt
Expand Down
71 changes: 48 additions & 23 deletions src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,46 @@ end
const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}

"""
OpenCLResults

Cached compilation results attached to each OpenCL `CodeInstance`. Fields are populated
through the compile pipeline: `bitcode` after LLVM codegen (for runtime functions, which
GPUCompiler links into the kernel module — see `GPUCompiler.bitcode`/`bitcode!`),
`obj` (SPIR-V bytes) + `entry` + `device_rng` after main codegen, and `kernels` after
the session-local link onto an OpenCL context. The first four are session-portable
(cached through precompilation); `kernels` is session-local.

`kernels` is a small linear cache of `(cl.Context, cl.Kernel)` pairs. The cache partition
already covers everything that affects codegen via `GPUCompiler.cache_owner`, so the only
runtime-visible dimension left is the OpenCL context that owns the linked `cl.Kernel`.
A linear scan with `===` is fastest in the common case (n=1) and stays cheap for the
rare workload that bounces between a handful of contexts on the same device.
"""
mutable struct OpenCLResults
    # LLVM bitcode, tagged with the pointer mode it was stored under:
    # `(opaque_pointers, bytes)`
    bitcode::Union{Nothing, Tuple{Bool, Vector{UInt8}}}
    # SPIR-V binary
    obj::Union{Nothing, Vector{UInt8}}
    entry::Union{Nothing, String}
    device_rng::Bool
    # session-local `(context, kernel)` pairs, searched with a linear scan
    kernels::Vector{Tuple{cl.Context, cl.Kernel}}

    # A fresh instance carries no artifacts: every optional field is `nothing`,
    # `device_rng` is `false`, and the kernel list is empty.
    function OpenCLResults()
        no_kernels = Vector{Tuple{cl.Context, cl.Kernel}}()
        return new(nothing, nothing, nothing, false, no_kernels)
    end
end

# Return the cached bitcode bytes, or `nothing` when nothing has been stored yet or when
# the stored bytes were recorded under a different `opaque_pointers` setting.
function GPUCompiler.bitcode(r::OpenCLResults, opaque_pointers::Bool)
    cached = r.bitcode
    if cached === nothing
        return nothing
    end

    mode, data = cached
    if mode === opaque_pointers
        return data
    end
    return nothing
end

# Remember `bytes` together with the `opaque_pointers` flag they belong to, overwriting
# any previously stored bitcode. Always returns `nothing`.
function GPUCompiler.bitcode!(
        r::OpenCLResults, bytes::Vector{UInt8}, opaque_pointers::Bool
    )
    tagged = (opaque_pointers, bytes)
    r.bitcode = tagged
    return
end

# Jobs parameterized by `OpenCLCompilerParams` (with any target) use the `OpenCL` module
# as their runtime module.
function GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams})
    return OpenCL
end

# OpenCL compiler jobs keep their cached compilation results in an `OpenCLResults`.
function GPUCompiler.results_type(::OpenCLCompilerJob)
    return OpenCLResults
end

# Build the method table view for an OpenCL job: a `StackedMethodTable` at the job's
# world age over this package's `method_table` and `SPIRVIntrinsics.method_table`.
function GPUCompiler.method_table_view(job::OpenCLCompilerJob)
    return GPUCompiler.StackedMethodTable(
        job.world, method_table, SPIRVIntrinsics.method_table
    )
end

Expand Down Expand Up @@ -111,18 +149,7 @@ function GPUCompiler.finish_linked_module!(@nospecialize(job::OpenCLCompilerJob)
return
end

## compiler implementation (cache, configure, compile, and link)

# cache of compilation caches, per context
const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}()
# Return the compilation cache registered for `ctx` in `_compiler_caches`; on first use
# an empty `Dict{Any, Any}` is created, registered under `ctx`, and returned.
function compiler_cache(ctx::cl.Context)
    return get!(_compiler_caches, ctx) do
        Dict{Any, Any}()
    end
end
## compiler implementation (configure, compile, and link)

# cache of compiler configurations, per device (but additionally configurable via kwargs)
const _toolchain = Ref{Any}()
Expand Down Expand Up @@ -153,37 +180,35 @@ end
CompilerConfig(target, params; kernel, name, always_inline)
end

# compile to executable machine code
# run inference + LLVM codegen + SPIR-V emission. returns `(obj, entry, device_rng)`,
# all session-portable so they survive precompilation when stored on a cached `CodeInstance`.
const compilations = Threads.Atomic{Int}(0)
function compile(@nospecialize(job::CompilerJob))
function compile_to_obj(@nospecialize(job::CompilerJob))
compilations[] += 1

# TODO: this creates a context; cache those.
obj, meta = JuliaContext() do ctx
JuliaContext() do ctx
obj, meta = GPUCompiler.compile(:obj, job)

entry = LLVM.name(meta.entry)
device_rng = StringAttribute("julia.opencl.rng", "") in collect(function_attributes(meta.entry))

(; obj, entry, device_rng)
end
end

# link into an executable kernel
function link(@nospecialize(job::CompilerJob), compiled)
# link the SPIR-V bytes into a session-local `cl.Kernel` on the active context.
function link_kernel(obj::Vector{UInt8}, entry::String)
prog = if "cl_khr_il_program" in cl.device().extensions
cl.Program(; il=compiled.obj)
cl.Program(; il=obj)
else
error("Your device does not support SPIR-V, which is currently required for native execution.")
# XXX: kpet/spirv2clc#87, caused by KhronosGroup/SPIRV-LLVM-Translator#2029
source = mktempdir() do dir
il = joinpath(dir, "kernel.spv")
write(il, compiled.obj)
write(il, obj)
cmd = `spirv2clc $il`
read(cmd, String)
end
cl.Program(; source)
end
cl.build!(prog)
(; kernel=cl.Kernel(prog, compiled.entry), compiled.device_rng)
return cl.Kernel(prog, entry)
end
56 changes: 40 additions & 16 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,28 +180,52 @@ end
# lock taken by `clfunction` (via `Base.@lock`) around its compilation and kernel-lookup
# work
const clfunction_lock = ReentrantLock()

function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
ctx = cl.context()
dev = cl.device()

Base.@lock clfunction_lock begin
# compile the function
cache = compiler_cache(ctx)
config = compiler_config(cl.device(); kwargs...)::OpenCLCompilerConfig
source = methodinstance(F, tt)
config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig
linked = GPUCompiler.cached_compilation(cache, source, config, compile, link)

# create a callable object that captures the function instance. we don't need to think
# about world age here, as GPUCompiler already does and will return a different object
h = hash(linked.kernel, hash(f, hash(tt)))
kernel = get(_kernel_instances, h, nothing)
job = CompilerJob(source, config)
cache = GPUCompiler.cache_view(job)

# `@something` (not the `something` function) so `compile_opencl!` only runs
# on a cache miss — otherwise Julia evaluates it eagerly and silently re-runs
# the full LLVM compile on every launch.
ci, res = @something lookup(cache, source) compile_opencl!(cache, job)

# Resolve the cl.Kernel for the active context. Linear scan over the
# session-local cache; almost always n=1, so this is one `===` compare.
ctx = cl.context()
kernel = nothing
@inbounds for (cached_ctx, cached_kernel) in res.kernels
if cached_ctx === ctx
kernel = cached_kernel
break
end
end
if kernel === nothing
# create the kernel state object
kernel = HostKernel{F,tt}(f, linked.kernel, linked.device_rng)
_kernel_instances[h] = kernel
kernel = link_kernel(res.obj::Vector{UInt8}, res.entry::String)
push!(res.kernels, (ctx, kernel))
end
return kernel::HostKernel{F,tt}

h = hash(kernel, hash(f, hash(tt)))
get!(_kernel_instances, h) do
HostKernel{F,tt}(f, kernel, res.device_rng)
end::HostKernel{F,tt}
end
end

# Compile `job` via `compile_to_obj` and record the session-portable outputs (`obj`,
# `entry`, `device_rng`) on the `OpenCLResults` cached for the job's source.
#
# The `CodeInstance` is not created here: it comes into existence during inference inside
# `GPUCompiler.compile` (which uses the same owner-partitioned `CacheView`), and receives
# a fresh `OpenCLResults()` through `@setup_caching`'s `finish!` hook. This function only
# looks both up afterwards and fills in the fields.
#
# Returns the `(CodeInstance, OpenCLResults)` pair.
function compile_opencl!(cache::CacheView, @nospecialize(job::CompilerJob))
    (; obj, entry, device_rng) = compile_to_obj(job)

    # the assertions fail loudly if compilation did not leave a cache entry behind
    codeinst = get(cache, job.source, nothing)::Core.CodeInstance
    cached = results(cache, codeinst)::OpenCLResults

    cached.obj = obj
    cached.entry = entry
    cached.device_rng = device_rng

    return (codeinst, cached)
end

# cache of kernel instances, keyed by the `hash(kernel, hash(f, hash(tt)))` value that
# `clfunction` computes; the entries it stores are `HostKernel` objects.
# NOTE(review): the key is the hash itself, so distinct kernels with colliding hashes
# would share an entry — confirm this is acceptable.
const _kernel_instances = Dict{UInt, Any}()
Loading