Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -257,12 +257,14 @@ if VERSION >= v"1.11.0-DEV.1552"
get_interpreter(@nospecialize(job::CompilerJob)) =
GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
token=ci_cache_token(job), inf_params=inference_params(job),
opt_params=optimization_params(job))
opt_params=optimization_params(job),
always_inline=job.config.always_inline)
else
get_interpreter(@nospecialize(job::CompilerJob)) =
GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
code_cache=ci_cache(job), inf_params=inference_params(job),
opt_params=optimization_params(job))
opt_params=optimization_params(job),
always_inline=job.config.always_inline)
end

# does this target support throwing Julia exceptions with jl_throw?
Expand Down
74 changes: 65 additions & 9 deletions src/jlgen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -420,21 +420,24 @@ end

inf_params::CC.InferenceParams
opt_params::CC.OptimizationParams

always_inline::Bool
end

@static if HAS_INTEGRATED_CACHE
function GPUInterpreter(world::UInt=Base.get_world_counter();
method_table_view::CC.MethodTableView,
token::Any,
inf_params::CC.InferenceParams,
opt_params::CC.OptimizationParams)
opt_params::CC.OptimizationParams,
always_inline::Bool=false)
@assert world <= Base.get_world_counter()

inf_cache = INFERENCE_CACHE_TYPE()

return GPUInterpreter(world, method_table_view,
token, inf_cache,
inf_params, opt_params)
inf_params, opt_params, always_inline)
end

function GPUInterpreter(interp::GPUInterpreter;
Expand All @@ -443,10 +446,11 @@ function GPUInterpreter(interp::GPUInterpreter;
token::Any=interp.token,
inf_cache::INFERENCE_CACHE_TYPE=interp.inf_cache,
inf_params::CC.InferenceParams=interp.inf_params,
opt_params::CC.OptimizationParams=interp.opt_params)
opt_params::CC.OptimizationParams=interp.opt_params,
always_inline::Bool=interp.always_inline)
return GPUInterpreter(world, method_table_view,
token, inf_cache,
inf_params, opt_params)
inf_params, opt_params, always_inline)
end

else
Expand All @@ -455,14 +459,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
method_table_view::CC.MethodTableView,
code_cache::CodeCache,
inf_params::CC.InferenceParams,
opt_params::CC.OptimizationParams)
opt_params::CC.OptimizationParams,
always_inline::Bool=false)
@assert world <= Base.get_world_counter()

inf_cache = Vector{CC.InferenceResult}()

return GPUInterpreter(world, method_table_view,
code_cache, inf_cache,
inf_params, opt_params)
inf_params, opt_params, always_inline)
end

function GPUInterpreter(interp::GPUInterpreter;
Expand All @@ -471,10 +476,11 @@ function GPUInterpreter(interp::GPUInterpreter;
code_cache::CodeCache=interp.code_cache,
inf_cache::Vector{CC.InferenceResult}=interp.inf_cache,
inf_params::CC.InferenceParams=interp.inf_params,
opt_params::CC.OptimizationParams=interp.opt_params)
opt_params::CC.OptimizationParams=interp.opt_params,
always_inline::Bool=interp.always_inline)
return GPUInterpreter(world, method_table_view,
code_cache, inf_cache,
inf_params, opt_params)
inf_params, opt_params, always_inline)
end
end # HAS_INTEGRATED_CACHE

Expand All @@ -498,7 +504,11 @@ end

CC.may_optimize(interp::GPUInterpreter) = true
CC.may_compress(interp::GPUInterpreter) = true
CC.may_discard_trees(interp::GPUInterpreter) = true
# When `always_inline=true`, preserve optimized IR for every callee: otherwise
# `transform_result_for_cache` drops sources whose `inlining_cost` saturated to
# `MAX_INLINE_COST`, leaving nothing for our inlining-policy override
# (`src_inlining_policy`, or `inlining_policy` on older Julia) to inline.
# See JuliaGPU/GPUCompiler.jl#527.
CC.may_discard_trees(interp::GPUInterpreter) = !interp.always_inline
@static if VERSION <= v"1.12.0-DEV.1531"
CC.verbose_stmt_info(interp::GPUInterpreter) = false
end
Expand All @@ -524,6 +534,52 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter,
return ret
end

# Force inlining of all functions with source code when `always_inline=true`.
#
# Julia's inliner stores per-function inlining cost in a fixed-width integer
# field on CodeInfo, then sets `is_inlineable(src) := inlining_cost != MAX_INLINE_COST`.
# When the body cost exceeds the storage's representable range it saturates to
# MAX_INLINE_COST and the function becomes permanently non-inlineable, regardless
# of the caller's `inline_cost_threshold`. The storage is UInt16 on 1.11/1.12
# (cap ≈65535) and was narrowed to UInt8 on 1.13+ (cap ≈5000 via
# jl_encode_inlining_cost), at which point reasonably-sized GPU kernel callees
# routinely saturate. See JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
#
# Bypassing the `is_inlineable` check here makes the inliner respect our
# `inline_cost_threshold = MAX_INLINE_COST` setting in practice. Julia 1.12+
# replaced the legacy `inlining_policy` hook (returns src or nothing) with
# `src_inlining_policy` (returns Bool); we override whichever of the two hooks
# this Julia version defines.
@static if isdefined(CC, :src_inlining_policy)
# Inlining hook used when `CC.src_inlining_policy` exists.
#
# With `interp.always_inline` set, any `src` that is (or, after unwrapping a
# `CC.OptimizationState`, contains) a `CC.MaybeCompressed` or `CC.IRCode` is
# reported as inlineable. Every other case falls through to the generic
# `CC.AbstractInterpreter` method, which receives the possibly-unwrapped `src`.
function CC.src_inlining_policy(interp::GPUInterpreter,
        @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
    if interp.always_inline
        # `OptimizationState` is not defined on every compiler version, hence
        # the compile-time guard around the unwrapping step.
        @static if isdefined(CC, :OptimizationState)
            if src isa CC.OptimizationState
                src = src.src
            end
        end
        if src isa CC.MaybeCompressed || src isa CC.IRCode
            return true
        end
    end
    # Not forcing (or nothing we know how to force): defer to the default policy.
    return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter,
                                          src::Any, info::CC.CallInfo,
                                          stmt_flag::UInt32)
end
else
# Legacy inlining hook, used when `CC.src_inlining_policy` is not defined.
#
# With `interp.always_inline` set:
#   * a `CC.MaybeCompressed` source is returned if `CC.is_source_inferred`
#     accepts it, and `nothing` is returned otherwise;
#   * a `CC.IRCode` or `CC.SemiConcreteResult` is returned unchanged.
# Every other case falls through to the generic `CC.AbstractInterpreter` method.
function CC.inlining_policy(interp::GPUInterpreter,
        @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
    if interp.always_inline
        if src isa CC.MaybeCompressed
            return CC.is_source_inferred(src) ? src : nothing
        end
        if src isa CC.IRCode || src isa CC.SemiConcreteResult
            return src
        end
    end
    # Not forcing (or nothing we know how to force): defer to the default policy.
    return @invoke CC.inlining_policy(interp::CC.AbstractInterpreter,
                                      src::Any, info::CC.CallInfo,
                                      stmt_flag::UInt32)
end
end


## world view of the cache
@static if VERSION < v"1.14-"
Expand Down
18 changes: 9 additions & 9 deletions test/native.jl
Original file line number Diff line number Diff line change
Expand Up @@ -356,13 +356,13 @@ end
end

@testset "always_inline" begin
# XXX: broken by JuliaLang/julia#51599, see JuliaGPU/GPUCompiler.jl#527.
# yet somehow this works on 1.12?
broken = VERSION >= v"1.13-"

# The body has to be big enough that the inferred `inlining_cost` field
# saturates to `MAX_INLINE_COST`, otherwise it gets inlined trivially.
# That field is UInt16 on 1.11/1.12 and UInt8 on 1.13+. See
# JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
mod = @eval module $(gensym())
import ..sink
expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:100; init=:x))
expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:1600; init=:x))
function g(x)
expensive(x)
return
Expand All @@ -378,20 +378,20 @@ end
Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true)
end

@test @filecheck(begin
@test @filecheck begin
@check_not "@{{(julia|j)_expensive_[0-9]+}}"
Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
end) broken=broken
end

@test @filecheck begin
@check "@{{(julia|j)_expensive_[0-9]+}}"
Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true)
end

@test @filecheck(begin
@test @filecheck begin
@check_not "@{{(julia|j)_expensive_[0-9]+}}"
Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
end) broken=broken
end
end

@testset "function attributes" begin
Expand Down
Loading