From 614f526d071e935b340135305376eeadb6a60e16 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 12 May 2026 12:42:30 +0200
Subject: [PATCH] Fix always_inline on Julia 1.11+ via inlining policy override
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The inlining decision is gated by `is_inlineable(src)`, which reads the
saturating `CodeInfo.inlining_cost` field. JuliaLang/julia#51599 narrowed
that field from UInt16 to UInt8 on 1.13, but it was already saturating on
1.11/1.12 for sufficiently large kernels — the existing test just had a
body small enough to slip under the UInt16 cap. Override
`src_inlining_policy` (1.12+) / `inlining_policy` (1.11) on
`GPUInterpreter` to force-allow inlining of any available source when
`always_inline=true`, and disable `may_discard_trees` in that mode so
the optimized IR survives for the policy to inline. Enlarge the test body
(from 100 to 1600 terms) so that, without this fix, it reproduces the bug
on every supported version. Mirrors the workaround already used in cuTile.

Fixes JuliaGPU/GPUCompiler.jl#527.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/interface.jl |  6 ++--
 src/jlgen.jl     | 74 ++++++++++++++++++++++++++++++++++++++++++------
 test/native.jl   | 18 ++++++------
 3 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/src/interface.jl b/src/interface.jl
index dd82630b..5ec4bfce 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -257,12 +257,14 @@ if VERSION >= v"1.11.0-DEV.1552"
 get_interpreter(@nospecialize(job::CompilerJob)) =
     GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
                    token=ci_cache_token(job), inf_params=inference_params(job),
-                   opt_params=optimization_params(job))
+                   opt_params=optimization_params(job),
+                   always_inline=job.config.always_inline)
 else
 get_interpreter(@nospecialize(job::CompilerJob)) =
     GPUInterpreter(job.world; method_table_view=maybe_cached(method_table_view(job)),
                    code_cache=ci_cache(job), inf_params=inference_params(job),
-                   opt_params=optimization_params(job))
+                   opt_params=optimization_params(job),
+                   always_inline=job.config.always_inline)
 end
 
 # does this target support throwing Julia exceptions with jl_throw?
diff --git a/src/jlgen.jl b/src/jlgen.jl
index 21216d79..eaaced4d 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -420,6 +420,8 @@ end
 
     inf_params::CC.InferenceParams
     opt_params::CC.OptimizationParams
+
+    always_inline::Bool
 end
 
 @static if HAS_INTEGRATED_CACHE
@@ -427,14 +429,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
                         method_table_view::CC.MethodTableView,
                         token::Any,
                         inf_params::CC.InferenceParams,
-                        opt_params::CC.OptimizationParams)
+                        opt_params::CC.OptimizationParams,
+                        always_inline::Bool=false)
     @assert world <= Base.get_world_counter()
 
     inf_cache = INFERENCE_CACHE_TYPE()
 
     return GPUInterpreter(world, method_table_view,
                           token, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
@@ -443,10 +446,11 @@ function GPUInterpreter(interp::GPUInterpreter;
                         token::Any=interp.token,
                         inf_cache::INFERENCE_CACHE_TYPE=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
-                        opt_params::CC.OptimizationParams=interp.opt_params)
+                        opt_params::CC.OptimizationParams=interp.opt_params,
+                        always_inline::Bool=interp.always_inline)
     return GPUInterpreter(world, method_table_view,
                           token, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 else
@@ -455,14 +459,15 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
                         method_table_view::CC.MethodTableView,
                         code_cache::CodeCache,
                         inf_params::CC.InferenceParams,
-                        opt_params::CC.OptimizationParams)
+                        opt_params::CC.OptimizationParams,
+                        always_inline::Bool=false)
     @assert world <= Base.get_world_counter()
 
     inf_cache = Vector{CC.InferenceResult}()
 
     return GPUInterpreter(world, method_table_view,
                           code_cache, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
@@ -471,10 +476,11 @@ function GPUInterpreter(interp::GPUInterpreter;
                         code_cache::CodeCache=interp.code_cache,
                         inf_cache::Vector{CC.InferenceResult}=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
-                        opt_params::CC.OptimizationParams=interp.opt_params)
+                        opt_params::CC.OptimizationParams=interp.opt_params,
+                        always_inline::Bool=interp.always_inline)
     return GPUInterpreter(world, method_table_view,
                           code_cache, inf_cache,
-                          inf_params, opt_params)
+                          inf_params, opt_params, always_inline)
 end
 end # HAS_INTEGRATED_CACHE
 
@@ -498,7 +504,11 @@ end
 
 CC.may_optimize(interp::GPUInterpreter) = true
 CC.may_compress(interp::GPUInterpreter) = true
-CC.may_discard_trees(interp::GPUInterpreter) = true
+# When `always_inline=true`, preserve optimized IR for every callee: otherwise
+# `transform_result_for_cache` drops sources whose `inlining_cost` saturated to
+# `MAX_INLINE_COST`, leaving nothing for our `src_inlining_policy` (1.12+) /
+# `inlining_policy` (1.11) override to inline. See JuliaGPU/GPUCompiler.jl#527.
+CC.may_discard_trees(interp::GPUInterpreter) = !interp.always_inline
 @static if VERSION <= v"1.12.0-DEV.1531"
 CC.verbose_stmt_info(interp::GPUInterpreter) = false
 end
@@ -524,6 +534,52 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter,
     return ret
 end
 
+# Force inlining of all functions with source code when `always_inline=true`.
+#
+# Julia's inliner stores per-function inlining cost in a fixed-width integer
+# field on CodeInfo, then sets `is_inlineable(src) := inlining_cost != MAX_INLINE_COST`.
+# When the body cost exceeds the storage's representable range it saturates to
+# MAX_INLINE_COST and the function becomes permanently non-inlineable, regardless
+# of the caller's `inline_cost_threshold`. The storage is UInt16 on 1.11/1.12
+# (cap ≈65535) and was narrowed to UInt8 on 1.13+ (cap ≈5000 via
+# jl_encode_inlining_cost), at which point reasonably-sized GPU kernel callees
+# routinely saturate. See JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
+#
+# Bypassing the `is_inlineable` check here makes the inliner respect our
+# `inline_cost_threshold = MAX_INLINE_COST` setting in practice. Julia 1.12+
+# replaced the legacy `inlining_policy` (returns src or nothing) with
+# `src_inlining_policy` (returns Bool); we override the version-appropriate hook.
+@static if isdefined(CC, :src_inlining_policy)
+    function CC.src_inlining_policy(interp::GPUInterpreter,
+            @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
+        if interp.always_inline
+            @static if isdefined(CC, :OptimizationState)
+                isa(src, CC.OptimizationState) && (src = src.src)
+            end
+            isa(src, CC.MaybeCompressed) && return true
+            isa(src, CC.IRCode) && return true
+        end
+        return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter,
+            src::Any, info::CC.CallInfo, stmt_flag::UInt32)
+    end
+else
+    function CC.inlining_policy(interp::GPUInterpreter,
+            @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32)
+        if interp.always_inline
+            if isa(src, CC.MaybeCompressed)
+                CC.is_source_inferred(src) || return nothing
+                return src
+            elseif isa(src, CC.IRCode)
+                return src
+            elseif isa(src, CC.SemiConcreteResult)
+                return src
+            end
+        end
+        return @invoke CC.inlining_policy(interp::CC.AbstractInterpreter,
+            src::Any, info::CC.CallInfo, stmt_flag::UInt32)
+    end
+end
+
 
 ## world view of the cache
 @static if VERSION < v"1.14-"
diff --git a/test/native.jl b/test/native.jl
index f97f659a..813f895f 100644
--- a/test/native.jl
+++ b/test/native.jl
@@ -356,13 +356,13 @@ end
 end
 
 @testset "always_inline" begin
-    # XXX: broken by JuliaLang/julia#51599, see JuliaGPU/GPUCompiler.jl#527.
-    #      yet somehow this works on 1.12?
-    broken = VERSION >= v"1.13-"
-
+    # The body has to be big enough that the inferred `inlining_cost` field
+    # saturates to `MAX_INLINE_COST`, otherwise it gets inlined trivially.
+    # That field is UInt16 on 1.11/1.12 and UInt8 on 1.13+. See
+    # JuliaGPU/GPUCompiler.jl#527 and JuliaLang/julia#51599.
     mod = @eval module $(gensym())
         import ..sink
-        expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:100; init=:x))
+        expensive(x) = $(foldl((e, _) -> :($sink($e) + $sink(x)), 1:1600; init=:x))
         function g(x)
             expensive(x)
             return
@@ -378,20 +378,20 @@ end
         Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true)
     end
 
-    @test @filecheck(begin
+    @test @filecheck begin
         @check_not "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.g, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
-    end) broken=broken
+    end
 
     @test @filecheck begin
         @check "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true)
     end
 
-    @test @filecheck(begin
+    @test @filecheck begin
         @check_not "@{{(julia|j)_expensive_[0-9]+}}"
         Native.code_llvm(mod.h, Tuple{Int64}; dump_module=true, kernel=true, always_inline=true)
-    end) broken=broken
+    end
 end
 
 @testset "function attributes" begin