diff --git a/src/driver.jl b/src/driver.jl index 4f94b0bb..53accfc9 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -295,7 +295,7 @@ const __llvm_initialized = Ref(false) # minimal optimization to convert the inttoptr/call into a direct call @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) end run!(pb, ir, llvm_machine(job.config.target)) end @@ -386,7 +386,7 @@ const __llvm_initialized = Ref(false) if has_deferred_jobs @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) end add!(pb, AlwaysInlinerPass()) add!(pb, NewPMFunctionPassManager()) do fpm diff --git a/src/gcn.jl b/src/gcn.jl index e310b5c5..8cc0ef56 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -68,7 +68,7 @@ function finish_ir!( add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) add!(fpm, SROAPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) add!(fpm, SimplifyCFGPass()) end diff --git a/src/interface.jl b/src/interface.jl index 46389621..dd82630b 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -226,6 +226,18 @@ end # Has the runtime available and does not require special handling uses_julia_runtime(@nospecialize(job::CompilerJob)) = false +# Optional toggles consulted by the optimization pipeline. Override this method to return +# a `NamedTuple` with any of the following keys (defaults shown): +# +# - `instcombine::Bool = true`: when `false`, the pipeline substitutes `InstSimplifyPass` +# for `InstCombinePass`, retaining only the simplification subset of the peephole +# transforms (useful e.g. for downstream rewriters like Enzyme that get confused by +# InstCombine's more aggressive rewrites). 
+# +# Returning a `NamedTuple` keeps this single extension point lightweight: downstream +# users add new keys without GPUCompiler having to grow an interface method per option. +optimization_options(@nospecialize(job::CompilerJob)) = (;) + # Is it legal to run vectorization passes on this target can_vectorize(@nospecialize(job::CompilerJob)) = false diff --git a/src/metal.jl b/src/metal.jl index 77c9ff89..bd4f8e66 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -162,7 +162,7 @@ end # note that it isn't enough to remove the function attribute, because the Metal LLVM # compiler re-optimizes and will rediscover the property. to avoid this, we inline # all functions that are marked noreturn, i.e., until LLVM cannot rediscover it. -function hide_noreturn!(mod::LLVM.Module) +function hide_noreturn!(@nospecialize(job::CompilerJob), mod::LLVM.Module) noreturn_attr = EnumAttribute("noreturn", 0) noinline_attr = EnumAttribute("noinline", 0) alwaysinline_attr = EnumAttribute("alwaysinline", 0) @@ -184,7 +184,7 @@ function hide_noreturn!(mod::LLVM.Module) add!(pb, AlwaysInlinerPass()) add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) end run!(pb, mod) end @@ -215,7 +215,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) add!(fpm, SROAPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) add!(fpm, SimplifyCFGPass()) end @@ -228,7 +228,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L end # JuliaGPU/Metal.jl#113 - hide_noreturn!(mod) + hide_noreturn!(job, mod) # get rid of unreachable control flow (JuliaGPU/Metal.jl#370). 
# note that this currently works in tandem with the `hide_noreturn!` pass above, @@ -250,7 +250,7 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L add!(pb, AlwaysInlinerPass()) add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) end run!(pb, mod) end @@ -386,7 +386,7 @@ function add_parameter_address_spaces!(@nospecialize(job::CompilerJob), mod::LLV add!(pb, SimplifyCFGPass()) add!(pb, SROAPass()) add!(pb, EarlyCSEPass()) - add!(pb, InstCombinePass()) + add!(pb, instcombine_pass(job)) run!(pb, mod) end diff --git a/src/optim.jl b/src/optim.jl index 95834f0b..8bdb110e 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -1,5 +1,16 @@ # LLVM IR optimization +# Pick the peephole pass according to `optimization_options(job).instcombine`. Defaults to +# `InstCombinePass` to match LLVM's standard pipeline; `InstSimplifyPass` is the fallback +# for back-ends that need only the simplification subset. 
+function instcombine_pass(@nospecialize(job::CompilerJob)) + opts = optimization_options(job) + if get(opts, :instcombine, true) + return InstCombinePass() + end + return InstSimplifyPass() +end + function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2) tm = llvm_machine(job.config.target) tti = llvm_targetinfo(job.config.target) @@ -99,14 +110,14 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l add!(mpm, NewPMFunctionPassManager()) do fpm if opt_level >= 2 add!(fpm, SROAPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, JumpThreadingPass()) add!(fpm, CorrelatedValuePropagationPass()) add!(fpm, ReassociatePass()) add!(fpm, EarlyCSEPass()) add!(fpm, AllocOptPass()) else - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) end end @@ -157,7 +168,7 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ add!(fpm, CorrelatedValuePropagationPass()) add!(fpm, DCEPass()) add!(fpm, IRCEPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, JumpThreadingPass()) end if opt_level >= 3 @@ -181,7 +192,7 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) add!(fpm, InjectTLIMappings()) add!(fpm, LoopVectorizePass()) add!(fpm, LoopLoadEliminationPass()) - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) add!(fpm, SLPVectorizerPass()) add!(fpm, VectorCombinePass()) @@ -250,7 +261,7 @@ function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), op if opt_level >= 1 add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, InstCombinePass()) + add!(fpm, instcombine_pass(job)) add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) end end diff --git a/src/ptx.jl b/src/ptx.jl index 3bdc3d3e..66880850 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -164,7 +164,7 @@ function 
optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), # NVPTX's target machine info enables runtime unrolling, # but Julia's pass sequence only invokes the simple unroller. add!(fpm, LoopUnrollPass(; job.config.opt_level)) - add!(fpm, InstCombinePass()) # clean-up redundancy + add!(fpm, instcombine_pass(job)) # clean-up redundancy add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm add!(lpm, LICMPass()) # the inner runtime check might be # outer loop invariant