diff --git a/src/driver.jl b/src/driver.jl
index 4f94b0bb..53accfc9 100644
--- a/src/driver.jl
+++ b/src/driver.jl
@@ -295,7 +295,7 @@ const __llvm_initialized = Ref(false)
             # minimal optimization to convert the inttoptr/call into a direct call
             @dispose pb=NewPMPassBuilder() begin
                 add!(pb, NewPMFunctionPassManager()) do fpm
-                    add!(fpm, InstCombinePass())
+                    add!(fpm, instcombine_pass(job))
                 end
                 run!(pb, ir, llvm_machine(job.config.target))
             end
@@ -386,7 +386,7 @@ const __llvm_initialized = Ref(false)
                     if has_deferred_jobs
                         @dispose pb=NewPMPassBuilder() begin
                             add!(pb, NewPMFunctionPassManager()) do fpm
-                                add!(fpm, InstCombinePass())
+                                add!(fpm, instcombine_pass(job))
                             end
                             add!(pb, AlwaysInlinerPass())
                             add!(pb, NewPMFunctionPassManager()) do fpm
diff --git a/src/gcn.jl b/src/gcn.jl
index e310b5c5..8cc0ef56 100644
--- a/src/gcn.jl
+++ b/src/gcn.jl
@@ -68,7 +68,7 @@ function finish_ir!(
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
diff --git a/src/interface.jl b/src/interface.jl
index 46389621..dd82630b 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -226,6 +226,18 @@ end
 # Has the runtime available and does not require special handling
 uses_julia_runtime(@nospecialize(job::CompilerJob)) = false
 
+# Pipeline tuning knobs for `job`. Override this method to return a `NamedTuple`; any key
+# that is left out keeps the default listed here:
+#
+# - `instcombine::Bool = true`: when `false`, the pipeline substitutes `InstSimplifyPass`
+#   for `InstCombinePass`, retaining only the simplification subset of the peephole
+#   transforms (useful e.g. for downstream rewriters like Enzyme that get confused by
+#   InstCombine's more aggressive rewrites).
+#
+# A single `NamedTuple`-returning hook keeps the interface small: supporting another
+# option means recognizing another key here, not adding another interface method.
+optimization_options(@nospecialize(job::CompilerJob)) = NamedTuple()
+
 # Is it legal to run vectorization passes on this target
 can_vectorize(@nospecialize(job::CompilerJob)) = false
 
diff --git a/src/metal.jl b/src/metal.jl
index 77c9ff89..bd4f8e66 100644
--- a/src/metal.jl
+++ b/src/metal.jl
@@ -162,7 +162,7 @@ end
 # note that it isn't enough to remove the function attribute, because the Metal LLVM
 # compiler re-optimizes and will rediscover the property. to avoid this, we inline
 # all functions that are marked noreturn, i.e., until LLVM cannot rediscover it.
-function hide_noreturn!(mod::LLVM.Module)
+function hide_noreturn!(job::CompilerJob, mod::LLVM.Module)
     noreturn_attr = EnumAttribute("noreturn", 0)
     noinline_attr = EnumAttribute("noinline", 0)
     alwaysinline_attr = EnumAttribute("alwaysinline", 0)
@@ -184,7 +184,7 @@ function hide_noreturn!(mod::LLVM.Module)
         add!(pb, AlwaysInlinerPass())
         add!(pb, NewPMFunctionPassManager()) do fpm
             add!(fpm, SimplifyCFGPass())
-            add!(fpm, InstCombinePass())
+            add!(fpm, instcombine_pass(job))
         end
         run!(pb, mod)
     end
@@ -215,7 +215,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, InferAddressSpacesPass())
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, SimplifyCFGPass())
             end
@@ -228,7 +228,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
     end
 
     # JuliaGPU/Metal.jl#113
-    hide_noreturn!(mod)
+    hide_noreturn!(job, mod)
 
     # get rid of unreachable control flow (JuliaGPU/Metal.jl#370).
     # note that this currently works in tandem with the `hide_noreturn!` pass above,
@@ -250,7 +250,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
             add!(pb, AlwaysInlinerPass())
             add!(pb, NewPMFunctionPassManager()) do fpm
                 add!(fpm, SimplifyCFGPass())
-                add!(fpm, InstCombinePass())
+                add!(fpm, instcombine_pass(job))
             end
             run!(pb, mod)
         end
@@ -386,7 +386,7 @@ function add_parameter_address_spaces!(@nospecialize(job::CompilerJob), mod::LLV
         add!(pb, SimplifyCFGPass())
         add!(pb, SROAPass())
         add!(pb, EarlyCSEPass())
-        add!(pb, InstCombinePass())
+        add!(pb, instcombine_pass(job))
 
         run!(pb, mod)
     end
diff --git a/src/optim.jl b/src/optim.jl
index 95834f0b..8bdb110e 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -1,5 +1,16 @@
 # LLVM IR optimization
 
+# Return the function-level peephole pass to schedule for `job`: `InstCombinePass()` unless
+# `optimization_options(job)` carries `instcombine = false`, in which case
+# `InstSimplifyPass()` is returned instead. A missing `:instcombine` key counts as `true`.
+# Pipelines call this wherever they would otherwise add `InstCombinePass()` directly.
+function instcombine_pass(@nospecialize(job::CompilerJob))
+    opts = optimization_options(job)
+    # `get` supplies the default when the options tuple has no `:instcombine` entry
+    use_instcombine = get(opts, :instcombine, true)
+    return use_instcombine ? InstCombinePass() : InstSimplifyPass()
+end
+
 function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2)
     tm = llvm_machine(job.config.target)
     tti = llvm_targetinfo(job.config.target)
@@ -99,14 +110,14 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         add!(mpm, NewPMFunctionPassManager()) do fpm
             if opt_level >= 2
                 add!(fpm, SROAPass())
-                add!(fpm, InstCombinePass())
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
                 add!(fpm, ReassociatePass())
                 add!(fpm, EarlyCSEPass())
                 add!(fpm, AllocOptPass())
             else
-                add!(fpm, InstCombinePass())
+                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
             end
         end
@@ -157,7 +168,7 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
         add!(fpm, CorrelatedValuePropagationPass())
         add!(fpm, DCEPass())
         add!(fpm, IRCEPass())
-        add!(fpm, InstCombinePass())
+        add!(fpm, instcombine_pass(job))
         add!(fpm, JumpThreadingPass())
     end
     if opt_level >= 3
@@ -181,7 +192,7 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
     add!(fpm, InjectTLIMappings())
     add!(fpm, LoopVectorizePass())
     add!(fpm, LoopLoadEliminationPass())
-    add!(fpm, InstCombinePass())
+    add!(fpm, instcombine_pass(job))
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
@@ -250,7 +261,7 @@ function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), op
 
     if opt_level >= 1
         add!(mpm, NewPMFunctionPassManager()) do fpm
-            add!(fpm, InstCombinePass())
+            add!(fpm, instcombine_pass(job))
             add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         end
     end
diff --git a/src/ptx.jl b/src/ptx.jl
index 3bdc3d3e..66880850 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -164,7 +164,7 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
             # NVPTX's target machine info enables runtime unrolling,
             # but Julia's pass sequence only invokes the simple unroller.
             add!(fpm, LoopUnrollPass(; job.config.opt_level))
-            add!(fpm, InstCombinePass())        # clean-up redundancy
+            add!(fpm, instcombine_pass(job))        # clean-up redundancy
             add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
                 add!(lpm, LICMPass())           # the inner runtime check might be
                                                 # outer loop invariant