From 5b3f0ac4642b39d6f848f8eae3be59b0f38dd212 Mon Sep 17 00:00:00 2001
From: Billy Moses <wmoses@google.com>
Date: Wed, 6 May 2026 13:38:38 -0500
Subject: [PATCH 1/3] Turn instcombine into a flag (and use instsimplify
 otherwise)

---
 src/driver.jl       | 12 ++++++++++--
 src/gcn.jl          |  6 +++++-
 src/interface.jl    |  3 +++
 src/metal.jl        | 28 ++++++++++++++++++++++------
 src/optim.jl        | 30 +++++++++++++++++++++++++-----
 src/ptx.jl          | 11 ++++++++++-
 test/helpers/ptx.jl |  3 ++-
 test/ptx.jl         |  8 ++++++++
 8 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/src/driver.jl b/src/driver.jl
index 4f94b0bb..b656ebdd 100644
--- a/src/driver.jl
+++ b/src/driver.jl
@@ -295,7 +295,11 @@ const __llvm_initialized = Ref(false)
             # minimal optimization to convert the inttoptr/call into a direct call
             @dispose pb=NewPMPassBuilder() begin
                 add!(pb, NewPMFunctionPassManager()) do fpm
-                    add!(fpm, InstCombinePass())
+                    if use_instcombine(job)
+                        add!(fpm, InstCombinePass())
+                    else
+                        add!(fpm, InstSimplifyPass())
+                    end
                 end
                 run!(pb, ir, llvm_machine(job.config.target))
             end
@@ -386,7 +390,11 @@ const __llvm_initialized = Ref(false)
                     if has_deferred_jobs
                         @dispose pb=NewPMPassBuilder() begin
                             add!(pb, NewPMFunctionPassManager()) do fpm
-                                add!(fpm, InstCombinePass())
+                                if use_instcombine(job)
+                                    add!(fpm, InstCombinePass())
+                                else
+                                    add!(fpm, InstSimplifyPass())
+                                end
                             end
                             add!(pb, AlwaysInlinerPass())
                             add!(pb, NewPMFunctionPassManager()) do fpm
diff --git a/src/gcn.jl b/src/gcn.jl
index e310b5c5..d186c3cf 100644
--- a/src/gcn.jl
+++ b/src/gcn.jl
@@ -68,7 +68,11 @@ function finish_ir!(
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                if use_instcombine(job)
+                    add!(fpm, InstCombinePass())
+                else
+                    add!(fpm, InstSimplifyPass())
+                end
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
diff --git a/src/interface.jl b/src/interface.jl
index 46389621..4e2576ef 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -226,6 +226,9 @@ end
 # Has the runtime available and does not require special handling
 uses_julia_runtime(@nospecialize(job::CompilerJob)) = false
 
+# whether to use instcombine or instsimplify
+use_instcombine(@nospecialize(job::CompilerJob)) = true
+
 # Is it legal to run vectorization passes on this target
 can_vectorize(@nospecialize(job::CompilerJob)) = false
 
diff --git a/src/metal.jl b/src/metal.jl
index 77c9ff89..b7a0c40d 100644
--- a/src/metal.jl
+++ b/src/metal.jl
@@ -162,7 +162,7 @@ end
 # note that it isn't enough to remove the function attribute, because the Metal LLVM
 # compiler re-optimizes and will rediscover the property. to avoid this, we inline
 # all functions that are marked noreturn, i.e., until LLVM cannot rediscover it.
-function hide_noreturn!(mod::LLVM.Module)
+function hide_noreturn!(job::CompilerJob, mod::LLVM.Module)
     noreturn_attr = EnumAttribute("noreturn", 0)
     noinline_attr = EnumAttribute("noinline", 0)
     alwaysinline_attr = EnumAttribute("alwaysinline", 0)
@@ -184,7 +184,11 @@ function hide_noreturn!(mod::LLVM.Module)
         add!(pb, AlwaysInlinerPass())
         add!(pb, NewPMFunctionPassManager()) do fpm
             add!(fpm, SimplifyCFGPass())
-            add!(fpm, InstCombinePass())
+            if use_instcombine(job)
+                add!(fpm, InstCombinePass())
+            else
+                add!(fpm, InstSimplifyPass())
+            end
         end
         run!(pb, mod)
     end
@@ -215,7 +219,11 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                if use_instcombine(job)
+                    add!(fpm, InstCombinePass())
+                else
+                    add!(fpm, InstSimplifyPass())
+                end
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
@@ -228,7 +236,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
     end
 
     # JuliaGPU/Metal.jl#113
-    hide_noreturn!(mod)
+    hide_noreturn!(job, mod)
 
     # get rid of unreachable control flow (JuliaGPU/Metal.jl#370).
     # note that this currently works in tandem with the `hide_noreturn!` pass above,
@@ -250,7 +258,11 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, AlwaysInlinerPass())
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, SimplifyCFGPass())
-                add!(fpm, InstCombinePass())
+                if use_instcombine(job)
+                    add!(fpm, InstCombinePass())
+                else
+                    add!(fpm, InstSimplifyPass())
+                end
             end
             run!(pb, mod)
         end
@@ -386,7 +398,11 @@ function add_parameter_address_spaces!(@nospecialize(job::CompilerJob), mod::LLV
         add!(pb, SimplifyCFGPass())
         add!(pb, SROAPass())
         add!(pb, EarlyCSEPass())
-        add!(pb, InstCombinePass())
+        if use_instcombine(job)
+            add!(pb, InstCombinePass())
+        else
+            add!(pb, InstSimplifyPass())
+        end
 
         run!(pb, mod)
     end
diff --git a/src/optim.jl b/src/optim.jl
index 95834f0b..a200a353 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -99,14 +99,22 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         add!(mpm, NewPMFunctionPassManager()) do fpm
             if opt_level >= 2
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                if use_instcombine(job)
+                    add!(fpm, InstCombinePass())
+                else
+                    add!(fpm, InstSimplifyPass())
+                end
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
                 add!(fpm, ReassociatePass())
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, AllocOptPass())
             else
-                add!(fpm, InstCombinePass())
+                if use_instcombine(job)
+                    add!(fpm, InstCombinePass())
+                else
+                    add!(fpm, InstSimplifyPass())
+                end
                 add!(fpm, EarlyCSEPass())
             end
         end
@@ -157,7 +165,11 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
         add!(fpm, CorrelatedValuePropagationPass())
         add!(fpm, DCEPass())
         add!(fpm, IRCEPass())
-        add!(fpm, InstCombinePass())
+        if use_instcombine(job)
+            add!(fpm, InstCombinePass())
+        else
+            add!(fpm, InstSimplifyPass())
+        end
         add!(fpm, JumpThreadingPass())
     end
     if opt_level >= 3
@@ -181,7 +193,11 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
     add!(fpm, InjectTLIMappings())
     add!(fpm, LoopVectorizePass())
     add!(fpm, LoopLoadEliminationPass())
-    add!(fpm, InstCombinePass())
+    if use_instcombine(job)
+        add!(fpm, InstCombinePass())
+    else
+        add!(fpm, InstSimplifyPass())
+    end
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
@@ -250,7 +266,11 @@ function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), op
 
     if opt_level >= 1
         add!(mpm, NewPMFunctionPassManager()) do fpm
-            add!(fpm, InstCombinePass())
+            if use_instcombine(job)
+                add!(fpm, InstCombinePass())
+            else
+                add!(fpm, InstSimplifyPass())
+            end
             add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         end
     end
diff --git a/src/ptx.jl b/src/ptx.jl
index 3bdc3d3e..43afdb5a 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -19,12 +19,15 @@ Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget
     maxregs::Union{Nothing,Int} = nothing
 
     fastmath::Bool = Base.JLOptions().fast_math == 1
+    instcombine::Bool = true
 
     # deprecated; remove with next major version
     exitable::Union{Nothing,Bool} = nothing
     unreachable::Union{Nothing,Bool} = nothing
 end
 
+use_instcombine(@nospecialize(job::CompilerJob{PTXCompilerTarget})) = job.config.target.instcombine
+
 function Base.hash(target::PTXCompilerTarget, h::UInt)
     h = hash(target.cap, h)
     h = hash(target.ptx, h)
@@ -36,6 +39,7 @@ function Base.hash(target::PTXCompilerTarget, h::UInt)
     h = hash(target.blocks_per_sm, h)
     h = hash(target.maxregs, h)
     h = hash(target.fastmath, h)
+    h = hash(target.instcombine, h)
 
     h
 end
@@ -91,6 +95,7 @@ function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget}))
     job.config.target.blocks_per_sm !== nothing && print(io, ", blocks_per_sm=$(job.config.target.blocks_per_sm)")
     job.config.target.maxregs !== nothing && print(io, ", maxregs=$(job.config.target.maxregs)")
     job.config.target.fastmath && print(io, ", fast math enabled")
+    !job.config.target.instcombine && print(io, ", instcombine disabled")
 end
 
 const ptx_intrinsics = ("vprintf", "__assertfail", "malloc", "free")
@@ -164,7 +169,11 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
             # NVPTX's target machine info enables runtime unrolling,
             # but Julia's pass sequence only invokes the simple unroller.
             add!(fpm, LoopUnrollPass(; job.config.opt_level))
-            add!(fpm, InstCombinePass())        # clean-up redundancy
+            if use_instcombine(job)
+                add!(fpm, InstCombinePass())        # clean-up redundancy
+            else
+                add!(fpm, InstSimplifyPass())
+            end
             add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
                 add!(lpm, LICMPass())           # the inner runtime check might be
                                                 # outer loop invariant
diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl
index e82416bc..e670ed0b 100644
--- a/test/helpers/ptx.jl
+++ b/test/helpers/ptx.jl
@@ -38,10 +38,11 @@ GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime
 function create_job(@nospecialize(func), @nospecialize(types);
                     minthreads=nothing, maxthreads=nothing,
                     blocks_per_sm=nothing, maxregs=nothing,
+                    instcombine=true,
                     kwargs...)
     config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
-    target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs)
+    target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs, instcombine)
     params = CompilerParams()
     config = CompilerConfig(target, params; kernel=false, config_kwargs...)
     CompilerJob(source, config), kwargs
diff --git a/test/ptx.jl b/test/ptx.jl
index 7010917a..7a19336b 100644
--- a/test/ptx.jl
+++ b/test/ptx.jl
@@ -169,6 +169,14 @@ end
     @test occursin("call void @julia_", ir)
 end
 
+@testset "instcombine" begin
+    mod = @eval module $(gensym())
+        foobar(x) = x + 1
+    end
+    PTX.code_llvm(mod.foobar, Tuple{Int}; instcombine=false)
+    @test true
+end
+
 end
 
 ############################################################################################

From c133dce5e8e71aed8ab5d8629b814362c26c3404 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 7 May 2026 17:13:54 +0200
Subject: [PATCH 2/3] Switch to extensible interface.

---
 src/driver.jl       | 12 ++----------
 src/gcn.jl          |  6 +-----
 src/interface.jl    | 13 +++++++++++--
 src/metal.jl        | 24 ++++--------------------
 src/optim.jl        | 41 ++++++++++++++++-------------------------
 src/ptx.jl          | 11 +----------
 test/helpers/ptx.jl | 11 ++++++++---
 7 files changed, 43 insertions(+), 75 deletions(-)

diff --git a/src/driver.jl b/src/driver.jl
index b656ebdd..53accfc9 100644
--- a/src/driver.jl
+++ b/src/driver.jl
@@ -295,11 +295,7 @@ const __llvm_initialized = Ref(false)
             # minimal optimization to convert the inttoptr/call into a direct call
             @dispose pb=NewPMPassBuilder() begin
                 add!(pb, NewPMFunctionPassManager()) do fpm
-                    if use_instcombine(job)
-                        add!(fpm, InstCombinePass())
-                    else
-                        add!(fpm, InstSimplifyPass())
-                    end
+                    add!(fpm, instcombine_pass(job))
                 end
                 run!(pb, ir, llvm_machine(job.config.target))
             end
@@ -390,11 +386,7 @@ const __llvm_initialized = Ref(false)
                     if has_deferred_jobs
                         @dispose pb=NewPMPassBuilder() begin
                             add!(pb, NewPMFunctionPassManager()) do fpm
-                                if use_instcombine(job)
-                                    add!(fpm, InstCombinePass())
-                                else
-                                    add!(fpm, InstSimplifyPass())
-                                end
+                                add!(fpm, instcombine_pass(job))
                             end
                             add!(pb, AlwaysInlinerPass())
                             add!(pb, NewPMFunctionPassManager()) do fpm
diff --git a/src/gcn.jl b/src/gcn.jl
index d186c3cf..8cc0ef56 100644
--- a/src/gcn.jl
+++ b/src/gcn.jl
@@ -68,11 +68,7 @@ function finish_ir!(
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                if use_instcombine(job)
-                    add!(fpm, InstCombinePass())
-                else
-                    add!(fpm, InstSimplifyPass())
-                end
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
diff --git a/src/interface.jl b/src/interface.jl
index 4e2576ef..dd82630b 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -226,8 +226,17 @@ end
 # Has the runtime available and does not require special handling
 uses_julia_runtime(@nospecialize(job::CompilerJob)) = false
 
-# whether to use instcombine or instsimplify
-use_instcombine(@nospecialize(job::CompilerJob)) = true
+# Optional knobs read by the optimization pipeline. A back-end opts in by overriding this
+# method and returning a `NamedTuple`; keys that are left out keep their default:
+#
+# - `instcombine::Bool = true`: when `false`, the pipeline substitutes `InstSimplifyPass`
+#   for `InstCombinePass`, retaining only the simplification subset of the peephole
+#   transforms (useful e.g. for downstream rewriters like Enzyme that get confused by
+#   InstCombine's more aggressive rewrites).
+#
+# A single `NamedTuple`-returning method keeps the extension point lightweight: a new
+# option is just a new key, so GPUCompiler need not grow an interface method per option.
+optimization_options(@nospecialize(job::CompilerJob)) = NamedTuple()
 
 # Is it legal to run vectorization passes on this target
 can_vectorize(@nospecialize(job::CompilerJob)) = false
diff --git a/src/metal.jl b/src/metal.jl
index b7a0c40d..bd4f8e66 100644
--- a/src/metal.jl
+++ b/src/metal.jl
@@ -184,11 +184,7 @@ function hide_noreturn!(job::CompilerJob, mod::LLVM.Module)
         add!(pb, AlwaysInlinerPass())
         add!(pb, NewPMFunctionPassManager()) do fpm
             add!(fpm, SimplifyCFGPass())
-            if use_instcombine(job)
-                add!(fpm, InstCombinePass())
-            else
-                add!(fpm, InstSimplifyPass())
-            end
+            add!(fpm, instcombine_pass(job))
         end
         run!(pb, mod)
     end
@@ -219,11 +215,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                if use_instcombine(job)
-                    add!(fpm, InstCombinePass())
-                else
-                    add!(fpm, InstSimplifyPass())
-                end
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
@@ -258,11 +250,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, AlwaysInlinerPass())
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, SimplifyCFGPass())
-                if use_instcombine(job)
-                    add!(fpm, InstCombinePass())
-                else
-                    add!(fpm, InstSimplifyPass())
-                end
+                add!(fpm, instcombine_pass(job))
             end
             run!(pb, mod)
         end
@@ -398,11 +386,7 @@ function add_parameter_address_spaces!(@nospecialize(job::CompilerJob), mod::LLV
         add!(pb, SimplifyCFGPass())
         add!(pb, SROAPass())
         add!(pb, EarlyCSEPass())
-        if use_instcombine(job)
-            add!(pb, InstCombinePass())
-        else
-            add!(pb, InstSimplifyPass())
-        end
+        add!(pb, instcombine_pass(job))
 
         run!(pb, mod)
     end
diff --git a/src/optim.jl b/src/optim.jl
index a200a353..8bdb110e 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -1,5 +1,16 @@
 # LLVM IR optimization
 
+# Return the peephole pass to schedule for `job`. `InstCombinePass` is the default, as in
+# LLVM's standard pipeline; a back-end whose `optimization_options(job)` sets
+# `instcombine = false` gets `InstSimplifyPass` (the simplification subset) instead.
+function instcombine_pass(@nospecialize(job::CompilerJob))
+    options = optimization_options(job)
+    # an absent `:instcombine` key means "default", i.e. keep running InstCombine
+    wants_instcombine = get(options, :instcombine, true)
+    wants_instcombine && return InstCombinePass()
+    return InstSimplifyPass()
+end
+
 function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2)
     tm = llvm_machine(job.config.target)
     tti = llvm_targetinfo(job.config.target)
@@ -99,22 +110,14 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         add!(mpm, NewPMFunctionPassManager()) do fpm
             if opt_level >= 2
                 add!(fpm, SROAPass())
-                if use_instcombine(job)
-                    add!(fpm, InstCombinePass())
-                else
-                    add!(fpm, InstSimplifyPass())
-                end
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
                 add!(fpm, ReassociatePass())
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, AllocOptPass())
             else
-                if use_instcombine(job)
-                    add!(fpm, InstCombinePass())
-                else
-                    add!(fpm, InstSimplifyPass())
-                end
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
             end
         end
@@ -165,11 +168,7 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
         add!(fpm, CorrelatedValuePropagationPass())
         add!(fpm, DCEPass())
         add!(fpm, IRCEPass())
-        if use_instcombine(job)
-            add!(fpm, InstCombinePass())
-        else
-            add!(fpm, InstSimplifyPass())
-        end
+        add!(fpm, instcombine_pass(job))
         add!(fpm, JumpThreadingPass())
     end
     if opt_level >= 3
@@ -193,11 +192,7 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
     add!(fpm, InjectTLIMappings())
     add!(fpm, LoopVectorizePass())
     add!(fpm, LoopLoadEliminationPass())
-    if use_instcombine(job)
-        add!(fpm, InstCombinePass())
-    else
-        add!(fpm, InstSimplifyPass())
-    end
+    add!(fpm, instcombine_pass(job))
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
@@ -266,11 +261,7 @@ function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), op
 
     if opt_level >= 1
         add!(mpm, NewPMFunctionPassManager()) do fpm
-            if use_instcombine(job)
-                add!(fpm, InstCombinePass())
-            else
-                add!(fpm, InstSimplifyPass())
-            end
+            add!(fpm, instcombine_pass(job))
             add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         end
     end
diff --git a/src/ptx.jl b/src/ptx.jl
index 43afdb5a..66880850 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -19,15 +19,12 @@ Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget
     maxregs::Union{Nothing,Int} = nothing
 
     fastmath::Bool = Base.JLOptions().fast_math == 1
-    instcombine::Bool = true
 
     # deprecated; remove with next major version
     exitable::Union{Nothing,Bool} = nothing
     unreachable::Union{Nothing,Bool} = nothing
 end
 
-use_instcombine(@nospecialize(job::CompilerJob{PTXCompilerTarget})) = job.config.target.instcombine
-
 function Base.hash(target::PTXCompilerTarget, h::UInt)
     h = hash(target.cap, h)
     h = hash(target.ptx, h)
@@ -39,7 +36,6 @@ function Base.hash(target::PTXCompilerTarget, h::UInt)
     h = hash(target.blocks_per_sm, h)
     h = hash(target.maxregs, h)
     h = hash(target.fastmath, h)
-    h = hash(target.instcombine, h)
 
     h
 end
@@ -95,7 +91,6 @@ function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget}))
     job.config.target.blocks_per_sm !== nothing && print(io, ", blocks_per_sm=$(job.config.target.blocks_per_sm)")
     job.config.target.maxregs !== nothing && print(io, ", maxregs=$(job.config.target.maxregs)")
     job.config.target.fastmath && print(io, ", fast math enabled")
-    !job.config.target.instcombine && print(io, ", instcombine disabled")
 end
 
 const ptx_intrinsics = ("vprintf", "__assertfail", "malloc", "free")
@@ -169,11 +164,7 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
             # NVPTX's target machine info enables runtime unrolling,
             # but Julia's pass sequence only invokes the simple unroller.
             add!(fpm, LoopUnrollPass(; job.config.opt_level))
-            if use_instcombine(job)
-                add!(fpm, InstCombinePass())        # clean-up redundancy
-            else
-                add!(fpm, InstSimplifyPass())
-            end
+            add!(fpm, instcombine_pass(job))        # clean-up redundancy
             add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
                 add!(lpm, LICMPass())           # the inner runtime check might be
                                                 # outer loop invariant
diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl
index e670ed0b..2e973b2a 100644
--- a/test/helpers/ptx.jl
+++ b/test/helpers/ptx.jl
@@ -3,10 +3,15 @@ module PTX
 using ..GPUCompiler
 import ..TestRuntime
 
-struct CompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct CompilerParams <: AbstractCompilerParams
+    instcombine::Bool = true
+end
 
 PTXCompilerJob = CompilerJob{PTXCompilerTarget,CompilerParams}
 
+GPUCompiler.optimization_options(@nospecialize(job::PTXCompilerJob)) =
+    (; instcombine = job.config.params.instcombine)
+
 struct PTXKernelState
     data::Int64
 end
@@ -42,8 +47,8 @@ function create_job(@nospecialize(func), @nospecialize(types);
                     kwargs...)
     config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
-    target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs, instcombine)
-    params = CompilerParams()
+    target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs)
+    params = CompilerParams(; instcombine)
     config = CompilerConfig(target, params; kernel=false, config_kwargs...)
     CompilerJob(source, config), kwargs
 end

From 7889824ace689a7633746a8bf9472495f86f8cf7 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 7 May 2026 17:15:37 +0200
Subject: [PATCH 3/3] Remove test.

---
 test/helpers/ptx.jl | 10 ++--------
 test/ptx.jl         |  8 --------
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/test/helpers/ptx.jl b/test/helpers/ptx.jl
index 2e973b2a..e82416bc 100644
--- a/test/helpers/ptx.jl
+++ b/test/helpers/ptx.jl
@@ -3,15 +3,10 @@ module PTX
 using ..GPUCompiler
 import ..TestRuntime
 
-Base.@kwdef struct CompilerParams <: AbstractCompilerParams
-    instcombine::Bool = true
-end
+struct CompilerParams <: AbstractCompilerParams end
 
 PTXCompilerJob = CompilerJob{PTXCompilerTarget,CompilerParams}
 
-GPUCompiler.optimization_options(@nospecialize(job::PTXCompilerJob)) =
-    (; instcombine = job.config.params.instcombine)
-
 struct PTXKernelState
     data::Int64
 end
@@ -43,12 +38,11 @@ GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime
 function create_job(@nospecialize(func), @nospecialize(types);
                     minthreads=nothing, maxthreads=nothing,
                     blocks_per_sm=nothing, maxregs=nothing,
-                    instcombine=true,
                     kwargs...)
     config_kwargs, kwargs = split_kwargs(kwargs, GPUCompiler.CONFIG_KWARGS)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = PTXCompilerTarget(; cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs)
-    params = CompilerParams(; instcombine)
+    params = CompilerParams()
     config = CompilerConfig(target, params; kernel=false, config_kwargs...)
     CompilerJob(source, config), kwargs
 end
diff --git a/test/ptx.jl b/test/ptx.jl
index 7a19336b..7010917a 100644
--- a/test/ptx.jl
+++ b/test/ptx.jl
@@ -169,14 +169,6 @@ end
     @test occursin("call void @julia_", ir)
 end
 
-@testset "instcombine" begin
-    mod = @eval module $(gensym())
-        foobar(x) = x + 1
-    end
-    PTX.code_llvm(mod.foobar, Tuple{Int}; instcombine=false)
-    @test true
-end
-
 end
 
 ############################################################################################