diff --git a/src/interface.jl b/src/interface.jl
index dd82630b..5ec4bfce 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -257,12 +257,14 @@ if VERSION >= v"1.11.0-DEV.1552"
     get_interpreter(@nospecialize(job::CompilerJob)) =
         GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
                        token=ci_cache_token(job), inf_params=inference_params(job),
-                       opt_params=optimization_params(job))
+                       opt_params=optimization_params(job),
+                       always_inline=job.config.always_inline)
 else
     get_interpreter(@nospecialize(job::CompilerJob)) =
         GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
                        code_cache=ci_cache(job), inf_params=inference_params(job),
-                       opt_params=optimization_params(job))
+                       opt_params=optimization_params(job),
+                       always_inline=job.config.always_inline)
 end
 
 # does this target support throwing Julia exceptions with jl_throw?
diff --git a/src/jlgen.jl b/src/jlgen.jl
index 21216d79..eaaced4d 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -420,6 +420,8 @@ end
 
     inf_params::CC.InferenceParams
     opt_params::CC.OptimizationParams
+
+    always_inline::Bool
 end
 
 @static if HAS_INTEGRATED_CACHE
@@ -427,14 +429,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
                         method_table_view::CC.MethodTableView,
                         token::Any,
                         inf_params::CC.InferenceParams,
-                        opt_params::CC.OptimizationParams)
+                        opt_params::CC.OptimizationParams,
+                        always_inline::Bool=false)
     @assert world <= Base.get_world_counter()
 
     inf_cache = INFERENCE_CACHE_TYPE()
 
     return GPUInterpreter(world, method_table_view,
                           token, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
@@ -443,10 +446,11 @@ function GPUInterpreter(interp::GPUInterpreter;
                         token::Any=interp.token,
                         inf_cache::INFERENCE_CACHE_TYPE=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
-                        opt_params::CC.OptimizationParams=interp.opt_params)
+                        opt_params::CC.OptimizationParams=interp.opt_params,
+                        always_inline::Bool=interp.always_inline)
     return GPUInterpreter(world, method_table_view,
                           token, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 else
@@ -455,14 +459,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
                         method_table_view::CC.MethodTableView,
                         code_cache::CodeCache,
                         inf_params::CC.InferenceParams,
-                        opt_params::CC.OptimizationParams)
+                        opt_params::CC.OptimizationParams,
+                        always_inline::Bool=false)
     @assert world <= Base.get_world_counter()
 
     inf_cache = Vector{CC.InferenceResult}()
 
     return GPUInterpreter(world, method_table_view,
                           code_cache, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
@@ -471,10 +476,11 @@ function GPUInterpreter(interp::GPUInterpreter;
                         code_cache::CodeCache=interp.code_cache,
                         inf_cache::Vector{CC.InferenceResult}=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
-                        opt_params::CC.OptimizationParams=interp.opt_params)
+                        opt_params::CC.OptimizationParams=interp.opt_params,
+                        always_inline::Bool=interp.always_inline)
     return GPUInterpreter(world, method_table_view,
                           code_cache, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 end # HAS_INTEGRATED_CACHE
@@ -498,7 +504,11 @@ end
 
 CC.may_optimize(interp::GPUInterpreter) = true
 CC.may_compress(interp::GPUInterpreter) = true
-CC.may_discard_trees(interp::GPUInterpreter) = true
+# When `always_inline=true`, preserve optimized IR for every callee: otherwise
+# `transform_result_for_cache` drops sources whose `inlining_cost` saturated to
+# `MAX_INLINE_COST`, leaving nothing for our `src_inlining_policy` override to
+# inline. See JuliaGPU/GPUCompiler.jl#527.
+CC.may_discard_trees(interp::GPUInterpreter) = !interp.always_inline
 @static if VERSION <= v"1.12.0-DEV.1531"
     CC.verbose_stmt_info(interp::GPUInterpreter) = false
 end
@@ -524,6 +534,52 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter,
     return ret
 end
 
+# Force inlining of all functions with source code when `always_inline=true`.
+#
+# Julia's inliner stores per-function inlining cost in a fixed-width integer
+# field on CodeInfo, then sets `is_inlineable(src) := inlining_cost != MAX_INLINE_COST`.
+# When the body cost exceeds the storage's representable range it saturates to
+# MAX_INLINE_COST and the function becomes permanently non-inlineable, regardless
+# of the caller's `inline_cost_threshold`. The storage is UInt16 on 1.11/1.12
+# (cap ≈65535) and was narrowed to UInt8 on 1.13+ (cap ≈5000 via
+# jl_encode_inlining_cost), at which point reasonably-sized GPU kernel callees
+# routinely saturate. See JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
+#
+# Bypassing the `is_inlineable` check here makes the inliner respect our
+# `inline_cost_threshold = MAX_INLINE_COST` setting in practice. Julia 1.12+
+# split the legacy `inlining_policy` (returns src or nothing) into
+# `src_inlining_policy` (returns Bool); we override the version-appropriate hook.
+@static if isdefined(CC, :src_inlining_policy)
+    function CC.src_inlining_policy(interp::GPUInterpreter,
+        @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
+        if interp.always_inline
+            @static if isdefined(CC, :OptimizationState)
+                isa(src, CC.OptimizationState) && (src = src.src)
+            end
+            isa(src, CC.MaybeCompressed) && return true
+            isa(src, CC.IRCode) && return true
+        end
+        return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter,
+            src::Any, info::CC.CallInfo, stmt_flag::UInt32)
+    end
+else
+    function CC.inlining_policy(interp::GPUInterpreter,
+        @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
+        if interp.always_inline
+            if isa(src, CC.MaybeCompressed)
+                CC.is_source_inferred(src) || return nothing
+                return src
+            elseif isa(src, CC.IRCode)
+                return src
+            elseif isa(src, CC.SemiConcreteResult)
+                return src
+            end
+        end
+        return @invoke CC.inlining_policy(interp::CC.AbstractInterpreter,
+            src::Any, info::CC.CallInfo, stmt_flag::UInt32)
+    end
+end
+
 
 ## world view of the cache
 @static if VERSION < v"1.14-"
diff --git a/test/native.jl b/test/native.jl
index f97f659a..813f895f 100644
--- a/test/native.jl
+++ b/test/native.jl
@@ -356,13 +356,13 @@ end
 end
 
 @testset "always_inline" begin
-    # XXX: broken by JuliaLang/julia#51599, see JuliaGPU/GPUCompiler.jl#527.
-    #      yet somehow this works on 1.12?
-    broken = VERSION >= v"1.13-"
-
+    # The body has to be big enough that the inferred `inlining_cost` field
+    # saturates to `MAX_INLINE_COST`, otherwise it gets inlined trivially.
+    # That field is UInt16 on 1.11/1.12 and UInt8 on 1.13+. See
+    # JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
     mod = @eval module $(gensym())
         import ..sink
-        expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:100; init=:x))
+        expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:1600; init=:x))
         function g(x)
             expensive(x)
             return
@@ -378,20 +378,20 @@ end
         Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true)
     end
 
-    @test @filecheck(begin
+    @test @filecheck begin
         @check_not "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
-    end) broken=broken
+    end
 
     @test @filecheck begin
         @check "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true)
     end
 
-    @test @filecheck(begin
+    @test @filecheck begin
         @check_not "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
-    end) broken=broken
+    end
 end
 
 @testset "function attributes" begin