Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "GPUCompiler"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "1.12.0"
version = "2.0.0"
authors = ["Tim Besard <tim.besard@gmail.com>"]

[workspace]
Expand Down
42 changes: 35 additions & 7 deletions src/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -298,19 +298,19 @@ pass_by_ref(@nospecialize(job::CompilerJob)) = false
# Default answer for every job and pointer is `false`; a back-end can add a more
# specific method for its own `CompilerJob` type to change that.
function valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid})
    return false
end

# Care is required for anything that impacts:
# - method_table
# - method_tables
# - inference_params
# - optimization_params
# By default that is just always_inline
# the cache token is compared with jl_egal
struct GPUCompilerCacheToken
target_type::Type
always_inline::Bool
method_table::Core.MethodTable
method_tables::Tuple{Vararg{Core.MethodTable}}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uuf can we use SimpleVector here?

end

ci_cache_token(@nospecialize(job::CompilerJob)) =
GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job))
GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_tables(job))

# the codeinstance cache to use -- should only be used for the constructor
if VERSION >= v"1.11.0-DEV.1552"
Expand All @@ -327,10 +327,38 @@ function ci_cache(@nospecialize(job::CompilerJob))
end
end

# the method table to use
# deprecate method_table on next-breaking release
method_table(@nospecialize(job::CompilerJob)) = GLOBAL_METHOD_TABLE
method_table_view(@nospecialize(job::CompilerJob)) = get_method_table_view(job.world, method_table(job))
"""
    method_tables(job::CompilerJob) -> Tuple{Vararg{Core.MethodTable}}

The back-end's method tables, in priority order. They are stacked on top of GPUCompiler's
internal runtime-intrinsic overlay table for inference, and are used as the discriminator
component of [`ci_cache_token`](@ref).

Most back-ends only need to declare a single table:

    Base.Experimental.@MethodTable(my_method_table)
    GPUCompiler.method_tables(::MyCompilerJob) = (my_method_table,)

If the back-end has overlays spread across multiple `Core.MethodTable`s (e.g. one local
to the package plus one inherited from a shared intrinsics library), return them in
priority order — the first match wins.

For full control of the inference-side `Core.Compiler.MethodTableView`, override
[`method_table_view`](@ref) instead; that is an internal hook and most back-ends should
not need it.
"""
function method_tables(@nospecialize(job::CompilerJob))
    # default: the back-end declares no method tables of its own
    return ()
end
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make this non-breaking by defining method_tables(...) = (method_table(...),)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could, but there's going to be a breaking release anyway (probably).


# Build the inference-side view of the back-end's method tables stacked on top of
# GPUCompiler's runtime-intrinsic overlay table. Back-ends generally shouldn't override
# this; override `method_tables(job)` instead.
function method_table_view(@nospecialize(job::CompilerJob))
parent = CC.OverlayMethodTable(job.world, GLOBAL_METHOD_TABLE)
Copy link
Copy Markdown
Member

@vchuravy vchuravy May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't an OverlayMethodTable, should probably be InternalMethodTable or

for mt in reverse(method_tables(job))
parent = StackedMethodTable(job.world, mt, parent)
end
return parent
end

# the inference parameters to use when constructing the GPUInterpreter
function inference_params(@nospecialize(job::CompilerJob))
Expand Down
10 changes: 5 additions & 5 deletions src/jlgen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,6 @@ end # !HAS_INTEGRATED_CACHE

## method overrides

Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)

# Implements a priority lookup for method tables, where the first match in the stack gets returned.
# An alternative to this would be to use a "Union" where we would query the parent method table and
# do a most-specific match.
Expand Down Expand Up @@ -402,8 +400,6 @@ else
maybe_cached(mtv::CC.MethodTableView) = mtv
end

get_method_table_view(world::UInt, mt::CC.MethodTable) = CC.OverlayMethodTable(world, mt)

# VERSION >= v"1.14.0-DEV.1691"
const INFERENCE_CACHE_TYPE = isdefined(CC, :InferenceCache) ? CC.InferenceCache : Vector{CC.InferenceResult}

Expand Down Expand Up @@ -493,7 +489,11 @@ CC.lock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing
CC.unlock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing

function CC.add_remark!(interp::GPUInterpreter, sv::CC.InferenceState, msg)
@safe_debug "Inference remark during GPU compilation of $(sv.linfo): $msg"
# NOTE: deliberately a no-op. emitting any logging here pulls all the components
# needed to evaluate the warning into the IR for the compile job, even when the
# remark never fires — those ccalls into the Julia CPU runtime then poison AOT
# compilation. See JuliaGPU/GPUCompiler.jl#749.
return nothing
end

# Always answers `true` for the GPUInterpreter.
function CC.may_optimize(interp::GPUInterpreter)
    return true
end
Expand Down
5 changes: 3 additions & 2 deletions src/runtime.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,9 @@ function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=n
# using the new nonrecursive codegen to handle function lookup ourselves?
if def isa Symbol
args = [gensym() for typ in types]
@eval @inline $def($(args...)) =
ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...))
@eval GPUCompiler.@device_function($return_type,
@inline $def($(args...)) =
ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...)))
end

return
Expand Down
32 changes: 32 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -242,3 +242,35 @@ end
return inits
end
end


## device function definitions

# GPUCompiler-owned method table holding overlays for GPU runtime intrinsics. Back-end
# method tables (declared via `method_tables(job)`) are stacked on top of this by
# `method_table_view`, so back-end overrides win first and these overlays remain
# reachable underneath. This is an internal table; back-ends must not `@overlay` it.
Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)

# define a CPU-visible stub plus an overlay in GLOBAL_METHOD_TABLE that holds the real
# device body. used to keep `ccall("extern gpu_*", ...)` bodies out of the native cache
# (so that `compile=all` sysimages / juliac don't try to resolve nonexistent symbols),
# while still letting GPU compilation find the real body via the back-end's stacked
# method table.
macro device_function(rt, ex)
    # expand any macros wrapped around the definition (callers pass e.g. `@inline f() = ...`)
    # in the caller's module; the expanded form is reused verbatim for the overlay below.
    device_def = macroexpand(__module__, ex)

    # the CPU-visible stub keeps the signature of the device definition, but its body is
    # swapped for a constructor call on the declared return type `rt`.
    # NOTE: the argument is 1 (rather than 0) so that a pointer-typed placeholder doesn't
    #       get lowered to C_NULL.
    stub = splitdef(device_def)
    stub[:body] = quote
        $rt(1)
    end
    cpu_def = combinedef(stub)

    # emit the stub first, then register the real body as an overlay in GPUCompiler's
    # own method table.
    # NOTE: no `@consistent_overlay` because the CPU stub returns a fake value
    return esc(quote
        $cpu_def

        Base.Experimental.@overlay($(GPUCompiler).GLOBAL_METHOD_TABLE, $device_def)
    end)
end
2 changes: 1 addition & 1 deletion test/helpers/native.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ module Runtime end
NativeCompilerJob = CompilerJob{NativeCompilerTarget,CompilerParams}
GPUCompiler.runtime_module(::NativeCompilerJob) = Runtime

GPUCompiler.method_table(@nospecialize(job::NativeCompilerJob)) = job.config.params.method_table
GPUCompiler.method_tables(@nospecialize(job::NativeCompilerJob)) = (job.config.params.method_table,)
GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.params.entry_safepoint

function create_job(@nospecialize(func), @nospecialize(types);
Expand Down
34 changes: 34 additions & 0 deletions test/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,37 @@ end
# Check that we can call this function from the CPU, to support deferred codegen for Enzyme.
@test ccall("extern deferred_codegen", llvmcall, UInt, (UInt,), 3) == 3
end

@testset "@device_function macro" begin
    using InteractiveUtils

    # What the macro is expected to produce:
    #  1. a CPU-visible function that yields the declared return type without ever
    #     mentioning the `extern gpu_*` symbol (so AOT compilation can link),
    #  2. an overlay registered in GPUCompiler.GLOBAL_METHOD_TABLE, which GPU
    #     compilation reaches through the stacked method-table view.

    # define the device functions in a throw-away module with a unique name
    sandbox = @eval module $(gensym("DeviceFunctionTest"))
        using GPUCompiler

        GPUCompiler.@device_function(Ptr{Nothing},
            @inline test_device_ptr() = ccall("extern gpu_test", llvmcall, Ptr{Nothing}, ()))

        GPUCompiler.@device_function(Nothing,
            @inline test_device_nothing() = ccall("extern gpu_test2", llvmcall, Nothing, ()))
    end

    stub_names = (:test_device_ptr, :test_device_nothing)

    # both CPU stubs must exist in the module
    for fname in stub_names
        @test isdefined(sandbox, fname)
    end

    # the overlays should be findable in GLOBAL_METHOD_TABLE
    overlay_view = Core.Compiler.OverlayMethodTable(Base.get_world_counter(),
                                                    GPUCompiler.GLOBAL_METHOD_TABLE)
    for fname in stub_names
        @test findsup(Tuple{typeof(getproperty(sandbox, fname))}, overlay_view) !== nothing
    end

    # the CPU stubs must not reference the extern gpu_* symbol — that's the whole point
    stub_ir = sprint(io -> code_llvm(io, sandbox.test_device_ptr, Tuple{}; debuginfo=:none))
    @test !occursin("gpu_test", stub_ir)
end
Loading