diff --git a/Project.toml b/Project.toml index d18d00a0..311ac256 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "GPUCompiler" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "1.12.0" +version = "2.0.0" authors = ["Tim Besard "] [workspace] diff --git a/src/interface.jl b/src/interface.jl index dd82630b..acaa8936 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -298,7 +298,7 @@ pass_by_ref(@nospecialize(job::CompilerJob)) = false valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid}) = false # Care is required for anything that impacts: -# - method_table +# - method_tables # - inference_params # - optimization_params # By default that is just always_inline @@ -306,11 +306,11 @@ valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid}) = false struct GPUCompilerCacheToken target_type::Type always_inline::Bool - method_table::Core.MethodTable + method_tables::Tuple{Vararg{Core.MethodTable}} end ci_cache_token(@nospecialize(job::CompilerJob)) = - GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job)) + GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_tables(job)) # the codeinstance cache to use -- should only be used for the constructor if VERSION >= v"1.11.0-DEV.1552" @@ -327,10 +327,38 @@ function ci_cache(@nospecialize(job::CompilerJob)) end end -# the method table to use -# deprecate method_table on next-breaking release -method_table(@nospecialize(job::CompilerJob)) = GLOBAL_METHOD_TABLE -method_table_view(@nospecialize(job::CompilerJob)) = get_method_table_view(job.world, method_table(job)) +""" + method_tables(job::CompilerJob) -> Tuple{Vararg{Core.MethodTable}} + +The back-end's method tables, in priority order. They are stacked on top of GPUCompiler's +internal runtime-intrinsic overlay table for inference, and are used as the discriminator +component of [`ci_cache_token`](@ref). 
+ +Most back-ends only need to declare a single table: + + Base.Experimental.@MethodTable(my_method_table) + GPUCompiler.method_tables(::MyCompilerJob) = (my_method_table,) + +If the back-end has overlays spread across multiple `Core.MethodTable`s (e.g. one local +to the package plus one inherited from a shared intrinsics library), return them in +priority order — the first match wins. + +For full control of the inference-side `Core.Compiler.MethodTableView`, override +[`method_table_view`](@ref) instead; that is an internal hook and most back-ends should +not need it. +""" +method_tables(@nospecialize(job::CompilerJob)) = () + +# Build the inference-side view: the back-end's method tables stacked on top of +# GPUCompiler's runtime-intrinsic overlay table. Tables are wrapped in `reverse` order so +# the first entry of `method_tables(job)` ends up outermost. Back-ends should generally override `method_tables(job)` instead of this. +function method_table_view(@nospecialize(job::CompilerJob)) + parent = CC.OverlayMethodTable(job.world, GLOBAL_METHOD_TABLE) + for mt in reverse(method_tables(job)) + parent = StackedMethodTable(job.world, mt, parent) + end + return parent +end # the inference parameters to use when constructing the GPUInterpreter function inference_params(@nospecialize(job::CompilerJob)) diff --git a/src/jlgen.jl b/src/jlgen.jl index 21216d79..af115c6b 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -295,8 +295,6 @@ end # !HAS_INTEGRATED_CACHE ## method overrides -Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE) - # Implements a priority lookup for method tables, where the first match in the stack get's returned. # An alternative to this would be to use a "Union" where we would query the parent method table and # do a most-specific match. @@ -402,8 +400,6 @@ else maybe_cached(mtv::CC.MethodTableView) = mtv end -get_method_table_view(world::UInt, mt::CC.MethodTable) = CC.OverlayMethodTable(world, mt) - # VERSION >= v"1.14.0-DEV.1691" const INFERENCE_CACHE_TYPE = isdefined(CC, :InferenceCache) ? 
CC.InferenceCache : Vector{CC.InferenceResult} @@ -493,7 +489,11 @@ CC.lock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing CC.unlock_mi_inference(interp::GPUInterpreter, mi::MethodInstance) = nothing function CC.add_remark!(interp::GPUInterpreter, sv::CC.InferenceState, msg) - @safe_debug "Inference remark during GPU compilation of $(sv.linfo): $msg" + # NOTE: deliberately a no-op. emitting any logging here pulls all the components + # needed to evaluate the warning into the IR for the compile job, even when the + # remark never fires — those ccalls into the Julia CPU runtime then poison AOT + # compilation. See JuliaGPU/GPUCompiler.jl#749. + return nothing end CC.may_optimize(interp::GPUInterpreter) = true diff --git a/src/runtime.jl b/src/runtime.jl index 2b11d915..ea41a433 100644 --- a/src/runtime.jl +++ b/src/runtime.jl @@ -82,8 +82,9 @@ function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=n # using the new nonrecursive codegen to handle function lookup ourselves? if def isa Symbol args = [gensym() for typ in types] - @eval @inline $def($(args...)) = - ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...)) + @eval GPUCompiler.@device_function($return_type, + @inline $def($(args...)) = + ccall($("extern $llvm_name"), llvmcall, $return_type, ($(types...),), $(args...))) end return diff --git a/src/utils.jl b/src/utils.jl index b98ead9d..8278bd2e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -242,3 +242,35 @@ end return inits end end + + +## device function definitions + +# GPUCompiler-owned method table holding overlays for GPU runtime intrinsics. Back-end +# method tables (declared via `method_tables(job)`) are stacked on top of this by +# `method_table_view`, so back-end overrides win first and these overlays remain +# reachable underneath. This is an internal table; back-ends must not `@overlay` it. 
+Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE) + +# `@device_function rt ex`: define a CPU-visible stub for the function definition `ex` plus an +# overlay in GLOBAL_METHOD_TABLE that holds the real device body. used to keep +# `ccall("extern gpu_*", ...)` bodies out of the native cache (so that `compile=all` sysimages / +# juliac don't try to resolve nonexistent symbols), while still letting GPU compilation find the +# real body via the back-end's stacked method table. `rt` is the return type used to build the stub body. +macro device_function(rt, ex) + ex = macroexpand(__module__, ex) + def = splitdef(ex) + + # replace the CPU body with a stub that only evaluates `rt(1)`, so it never references the extern symbol. + # NOTE: 1 (rather than 0) so that a pointer-typed `rt` (e.g. `Ptr{Nothing}(1)`) doesn't get lowered to C_NULL. NOTE(review): presumably throws if actually called for an `rt` such as `Nothing` that has no such constructor — confirm this is acceptable. + def[:body] = quote + $rt(1) + end + + return esc(quote + $(combinedef(def)) + + # NOTE: no `@consistent_overlay` because the CPU stub returns a fake value + Base.Experimental.@overlay($(GPUCompiler).GLOBAL_METHOD_TABLE, $ex) + end) +end diff --git a/test/helpers/native.jl b/test/helpers/native.jl index 656028f4..b86e4e79 100644 --- a/test/helpers/native.jl +++ b/test/helpers/native.jl @@ -19,7 +19,7 @@ module Runtime end NativeCompilerJob = CompilerJob{NativeCompilerTarget,CompilerParams} GPUCompiler.runtime_module(::NativeCompilerJob) = Runtime -GPUCompiler.method_table(@nospecialize(job::NativeCompilerJob)) = job.config.params.method_table +GPUCompiler.method_tables(@nospecialize(job::NativeCompilerJob)) = (job.config.params.method_table,) GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.params.entry_safepoint function create_job(@nospecialize(func), @nospecialize(types); diff --git a/test/utils.jl b/test/utils.jl index 4ce2258c..edbadc42 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -171,3 +171,37 @@ end # Check that we can call this function from the CPU, to support deferred codegen for Enzyme. 
@test ccall("extern deferred_codegen", llvmcall, UInt, (UInt,), 3) == 3 end + +@testset "@device_function macro" begin + using InteractiveUtils + + # The macro should: + # 1. define a CPU-visible function that returns the expected type without + # referencing the `extern gpu_*` symbol (so AOT compilation can link), + # 2. register an overlay in GPUCompiler.GLOBAL_METHOD_TABLE that GPU compilation + # finds via the stacked method-table view. + + test_mod = @eval module $(gensym("DeviceFunctionTest")) + using GPUCompiler + + GPUCompiler.@device_function(Ptr{Nothing}, + @inline test_device_ptr() = ccall("extern gpu_test", llvmcall, Ptr{Nothing}, ())) + + GPUCompiler.@device_function(Nothing, + @inline test_device_nothing() = ccall("extern gpu_test2", llvmcall, Nothing, ())) + end + + @test isdefined(test_mod, :test_device_ptr) + @test isdefined(test_mod, :test_device_nothing) + + # the overlays should be findable in GLOBAL_METHOD_TABLE. `findsup` on an overlay view returns a tuple (never `nothing`) and falls back to the regular method table, so check that the matched method is not the CPU stub. + mt_view = Core.Compiler.OverlayMethodTable(Base.get_world_counter(), + GPUCompiler.GLOBAL_METHOD_TABLE) + @test first(Core.Compiler.findsup(Tuple{typeof(test_mod.test_device_ptr)}, mt_view)).method !== which(test_mod.test_device_ptr, Tuple{}) + @test first(Core.Compiler.findsup(Tuple{typeof(test_mod.test_device_nothing)}, mt_view)).method !== which(test_mod.test_device_nothing, Tuple{}) + + # the CPU stubs must not reference the extern gpu_* symbol — that's the whole point + buf = IOBuffer() + code_llvm(buf, test_mod.test_device_ptr, Tuple{}; debuginfo=:none) + @test !occursin("gpu_test", String(take!(buf))) +end