From 614f526d071e935b340135305376eeadb6a60e16 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Tue, 12 May 2026 12:42:30 +0200
Subject: [PATCH] Fix always_inline on Julia 1.11+ via inlining policy override
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The inlining decision is gated by `is_inlineable(src)`, which reads the
saturating `CodeInfo.inlining_cost` field. JuliaLang/julia#51599 narrowed
that field from UInt16 to UInt8 on 1.13, but it was already saturating on
1.11/1.12 for sufficiently large kernels — the existing test just had a
body small enough to slip under the UInt16 cap.

Override `src_inlining_policy` (1.12+) / `inlining_policy` (1.11) on
`GPUInterpreter` to force-allow inlining of any available source when
`always_inline=true`, and disable `may_discard_trees` in that mode so the
optimized IR survives for the policy to inline. Bump the test body so it
now reproduces the bug on every supported version.

Mirrors the workaround already used in cuTile.

Fixes JuliaGPU/GPUCompiler.jl#527.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/interface.jl | 6 ++-- src/jlgen.jl | 74 ++++++++++++++++++++++++++++++++++++++++++------ test/native.jl | 18 ++++++------ 3 files changed, 78 insertions(+), 20 deletions(-) diff --git a/src/interface.jl b/src/interface.jl index dd82630b..5ec4bfce 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -257,12 +257,14 @@ if VERSION >= v"1.11.0-DEV.1552" get_interpreter(@nospecialize(job::CompilerJob)) = GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)), token=ci_cache_token(job), inf_params=inference_params(job), - opt_params=optimization_params(job)) + opt_params=optimization_params(job), + always_inline=job.config.always_inline) else get_interpreter(@nospecialize(job::CompilerJob)) = GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)), code_cache=ci_cache(job), inf_params=inference_params(job), - opt_params=optimization_params(job)) + opt_params=optimization_params(job), + always_inline=job.config.always_inline) end # does this target support throwing Julia exceptions with jl_throw? 
diff --git a/src/jlgen.jl b/src/jlgen.jl index 21216d79..eaaced4d 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -420,6 +420,8 @@ end inf_params::CC.InferenceParams opt_params::CC.OptimizationParams + + always_inline::Bool end @static if HAS_INTEGRATED_CACHE @@ -427,14 +429,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter(); method_table_view::CC.MethodTableView, token::Any, inf_params::CC.InferenceParams, - opt_params::CC.OptimizationParams) + opt_params::CC.OptimizationParams, + always_inline::Bool=false) @assert world <= Base.get_world_counter() inf_cache = INFERENCE_CACHE_TYPE() return GPUInterpreter(world, method_table_view, token, inf_cache, - inf_params, opt_params) + inf_params, opt_params, always_inline) end function GPUInterpreter(interp::GPUInterpreter; @@ -443,10 +446,11 @@ function GPUInterpreter(interp::GPUInterpreter; token::Any=interp.token, inf_cache::INFERENCE_CACHE_TYPE=interp.inf_cache, inf_params::CC.InferenceParams=interp.inf_params, - opt_params::CC.OptimizationParams=interp.opt_params) + opt_params::CC.OptimizationParams=interp.opt_params, + always_inline::Bool=interp.always_inline) return GPUInterpreter(world, method_table_view, token, inf_cache, - inf_params, opt_params) + inf_params, opt_params, always_inline) end else @@ -455,14 +459,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter(); method_table_view::CC.MethodTableView, code_cache::CodeCache, inf_params::CC.InferenceParams, - opt_params::CC.OptimizationParams) + opt_params::CC.OptimizationParams, + always_inline::Bool=false) @assert world <= Base.get_world_counter() inf_cache = Vector{CC.InferenceResult}() return GPUInterpreter(world, method_table_view, code_cache, inf_cache, - inf_params, opt_params) + inf_params, opt_params, always_inline) end function GPUInterpreter(interp::GPUInterpreter; @@ -471,10 +476,11 @@ function GPUInterpreter(interp::GPUInterpreter; code_cache::CodeCache=interp.code_cache, 
inf_cache::Vector{CC.InferenceResult}=interp.inf_cache, inf_params::CC.InferenceParams=interp.inf_params, - opt_params::CC.OptimizationParams=interp.opt_params) + opt_params::CC.OptimizationParams=interp.opt_params, + always_inline::Bool=interp.always_inline) return GPUInterpreter(world, method_table_view, code_cache, inf_cache, - inf_params, opt_params) + inf_params, opt_params, always_inline) end end # HAS_INTEGRATED_CACHE @@ -498,7 +504,11 @@ end CC.may_optimize(interp::GPUInterpreter) = true CC.may_compress(interp::GPUInterpreter) = true -CC.may_discard_trees(interp::GPUInterpreter) = true +# When `always_inline=true`, preserve optimized IR for every callee: otherwise +# `transform_result_for_cache` drops sources whose `inlining_cost` saturated to +# `MAX_INLINE_COST`, leaving nothing for our `src_inlining_policy` override to +# inline. See JuliaGPU/GPUCompiler.jl#527. +CC.may_discard_trees(interp::GPUInterpreter) = !interp.always_inline @static if VERSION <= v"1.12.0-DEV.1531" CC.verbose_stmt_info(interp::GPUInterpreter) = false end @@ -524,6 +534,52 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter, return ret end +# Force inlining of all functions with source code when `always_inline=true`. +# +# Julia's inliner stores per-function inlining cost in a fixed-width integer +# field on CodeInfo, then sets `is_inlineable(src) := inlining_cost != MAX_INLINE_COST`. +# When the body cost exceeds the storage's representable range it saturates to +# MAX_INLINE_COST and the function becomes permanently non-inlineable, regardless +# of the caller's `inline_cost_threshold`. The storage is UInt16 on 1.11/1.12 +# (cap ≈65535) and was narrowed to UInt8 on 1.13+ (cap ≈5000 via +# jl_encode_inlining_cost), at which point reasonably-sized GPU kernel callees +# routinely saturate. See JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599. 
+# +# Bypassing the `is_inlineable` check here makes the inliner respect our +# `inline_cost_threshold = MAX_INLINE_COST` setting in practice. Julia 1.12+ +# split the legacy `inlining_policy` (returns src or nothing) into +# `src_inlining_policy` (returns Bool); we override the version-appropriate hook. +@static if isdefined(CC, :src_inlining_policy) + function CC.src_inlining_policy(interp::GPUInterpreter, + @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32) + if interp.always_inline + @static if isdefined(CC, :OptimizationState) + isa(src, CC.OptimizationState) && (src = src.src) + end + isa(src, CC.MaybeCompressed) && return true + isa(src, CC.IRCode) && return true + end + return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter, + src::Any, info::CC.CallInfo, stmt_flag::UInt32) + end +else + function CC.inlining_policy(interp::GPUInterpreter, + @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32) + if interp.always_inline + if isa(src, CC.MaybeCompressed) + CC.is_source_inferred(src) || return nothing + return src + elseif isa(src, CC.IRCode) + return src + elseif isa(src, CC.SemiConcreteResult) + return src + end + end + return @invoke CC.inlining_policy(interp::CC.AbstractInterpreter, + src::Any, info::CC.CallInfo, stmt_flag::UInt32) + end +end + ## world view of the cache @static if VERSION < v"1.14-" diff --git a/test/native.jl b/test/native.jl index f97f659a..813f895f 100644 --- a/test/native.jl +++ b/test/native.jl @@ -356,13 +356,13 @@ end end @testset "always_inline" begin - # XXX: broken by JuliaLang/julia#51599, see JuliaGPU/GPUCompiler.jl#527. - # yet somehow this works on 1.12? - broken = VERSION >= v"1.13-" - + # The body has to be big enough that the inferred `inlining_cost` field + # saturates to `MAX_INLINE_COST`, otherwise it gets inlined trivially. + # That field is UInt16 on 1.11/1.12 and UInt8 on 1.13+. See + # JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599. 
mod = @eval module $(gensym()) import ..sink - expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:100; init=:x)) + expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:1600; init=:x)) function g(x) expensive(x) return @@ -378,20 +378,20 @@ end Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true) end - @test @filecheck(begin + @test @filecheck begin @check_not "@{{(julia|j)_expensive_[0-9]+}}" Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true) - end) broken=broken + end @test @filecheck begin @check "@{{(julia|j)_expensive_[0-9]+}}" Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true) end - @test @filecheck(begin + @test @filecheck begin @check_not "@{{(julia|j)_expensive_[0-9]+}}" Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true) - end) broken=broken + end end @testset "function attributes" begin