diff --git a/src/optim.jl b/src/optim.jl index 8ecf4dc0..0bf2d4cd 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -108,30 +108,52 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob), add!(mpm, VerifierPass()) end add!(mpm, ForceFunctionAttrsPass()) - # TODO invokePipelineStartCallbacks + if LLVM.version() >= v"17" + add!(mpm, PipelineStartCallbacks(; opt_level)) + end add!(mpm, Annotation2MetadataPass()) + add!(mpm, InferFunctionAttrsPass()) add!(mpm, ConstantMergePass()) add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, LowerExpectIntrinsicPass()) if opt_level >= 2 add!(fpm, PropagateJuliaAddrspacesPass()) end + # DCE must come before simplifycfg: codegen can generate unused + # statements that would otherwise alter how simplifycfg optimizes the CFG. + add!(fpm, DCEPass()) add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...)) if opt_level >= 1 - add!(fpm, DCEPass()) add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) end end - # TODO invokeEarlySimplificationCallbacks + if opt_level >= 1 + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PromotePass()) + add!(fpm, instcombine_pass(job)) + end + end + if LLVM.version() >= v"17" + add!(mpm, PipelineEarlySimplificationCallbacks(; opt_level)) + end end function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + if LLVM.version() >= v"17" + add!(mpm, OptimizerEarlyCallbacks(; opt_level)) + end add!(mpm, NewPMCGSCCPassManager()) do cgpm - # TODO invokeCGSCCCallbacks - add!(cgpm, NewPMFunctionPassManager()) do fpm - add!(fpm, AllocOptPass()) - add!(fpm, Float2IntPass()) - add!(fpm, LowerConstantIntrinsicsPass()) + if LLVM.version() >= v"17" + add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level)) + end + if opt_level >= 2 + add!(cgpm, NewPMFunctionPassManager()) do fpm + add!(fpm, AllocOptPass()) + add!(fpm, Float2IntPass()) + add!(fpm, LowerConstantIntrinsicsPass()) + end end end add!(mpm, GPULowerCPUFeaturesPass()) @@ -139,50 +161,63 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l add!(mpm, NewPMFunctionPassManager()) do fpm if opt_level >= 2 add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass(; memssa=true)) add!(fpm, instcombine_pass(job)) + add!(fpm, AggressiveInstCombinePass()) add!(fpm, JumpThreadingPass()) add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, LibCallsShrinkWrapPass()) add!(fpm, ReassociatePass()) - add!(fpm, EarlyCSEPass()) + add!(fpm, ConstraintEliminationPass()) add!(fpm, AllocOptPass()) else - add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) + add!(fpm, instcombine_pass(job)) + end + if LLVM.version() >= v"17" + add!(fpm, PeepholeCallbacks(; opt_level)) end end - # TODO invokePeepholeCallbacks end + add!(mpm, GlobalOptPass()) + add!(mpm, GlobalDCEPass()) end function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) - add!(fpm, NewPMLoopPassManager()) do lpm + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm add!(lpm, LowerSIMDLoopPass()) if opt_level >= 2 + add!(lpm, LoopInstSimplifyPass()) + add!(lpm, LoopSimplifyCFGPass()) + # run LICM with AllowSpeculation=false before rotation to avoid + # speculating loads that rotation can hoist more precisely. + add!(lpm, LICMPass(; allowspeculation=false)) + add!(lpm, JuliaLICMPass()) add!(lpm, LoopRotatePass()) - end - # TODO invokeLateLoopOptimizationCallbacks - end - if opt_level >= 2 - add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm add!(lpm, LICMPass()) add!(lpm, JuliaLICMPass()) add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true)) - add!(lpm, LICMPass()) - add!(lpm, JuliaLICMPass()) + end + if LLVM.version() >= v"17" + add!(lpm, LateLoopOptimizationsCallbacks(; opt_level)) end end if opt_level >= 2 add!(fpm, IRCEPass()) end + add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...)) + add!(fpm, instcombine_pass(job)) add!(fpm, NewPMLoopPassManager()) do lpm if opt_level >= 2 - add!(lpm, LoopInstSimplifyPass()) add!(lpm, LoopIdiomRecognizePass()) add!(lpm, IndVarSimplifyPass()) + add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true)) add!(lpm, LoopDeletionPass()) add!(lpm, LoopFullUnrollPass()) end - # TODO invokeLoopOptimizerEndCallbacks + if LLVM.version() >= v"17" + add!(lpm, LoopOptimizerEndCallbacks(; opt_level)) + end end end @@ -190,44 +225,84 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ if opt_level >= 2 add!(fpm, AllocOptPass()) add!(fpm, SROAPass()) - add!(fpm, InstSimplifyPass()) + add!(fpm, VectorCombinePass()) + add!(fpm, MergedLoadStoreMotionPass()) add!(fpm, GVNPass()) - add!(fpm, MemCpyOptPass()) add!(fpm, SCCPPass()) + add!(fpm, BDCEPass()) + add!(fpm, instcombine_pass(job)) add!(fpm, CorrelatedValuePropagationPass()) - add!(fpm, DCEPass()) + add!(fpm, ADCEPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, DSEPass()) add!(fpm, IRCEPass()) - add!(fpm, instcombine_pass(job)) add!(fpm, JumpThreadingPass()) + add!(fpm, ConstraintEliminationPass()) + elseif opt_level >= 1 + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + add!(fpm, BDCEPass()) + add!(fpm, instcombine_pass(job)) + add!(fpm, ADCEPass()) end if opt_level >= 3 add!(fpm, GVNPass()) end if opt_level >= 2 add!(fpm, DSEPass()) - # TODO invokePeepholeCallbacks + if LLVM.version() >= v"17" + add!(fpm, PeepholeCallbacks(; opt_level)) + end add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) add!(fpm, AllocOptPass()) - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, LoopDeletionPass()) - add!(lpm, LoopInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) end - add!(fpm, LoopDistributePass()) + add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) + add!(fpm, instcombine_pass(job)) + elseif opt_level >= 1 + add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) + end + if LLVM.version() >= v"17" + add!(fpm, ScalarOptimizerLateCallbacks(; opt_level)) end - # TODO invokeScalarOptimizerCallbacks end function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) + # re-rotate loops that might have been unrotated in the simplification above + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopRotatePass()) + add!(lpm, LoopDeletionPass()) + end + add!(fpm, LoopDistributePass()) add!(fpm, InjectTLIMappings()) add!(fpm, LoopVectorizePass()) add!(fpm, LoopLoadEliminationPass()) - add!(fpm, instcombine_pass(job)) add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm + add!(lpm, LICMPass()) + end + add!(fpm, EarlyCSEPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, instcombine_pass(job)) add!(fpm, SLPVectorizerPass()) add!(fpm, VectorCombinePass()) - # TODO invokeVectorizerCallbacks - add!(fpm, ADCEPass()) + if LLVM.version() >= v"17" + add!(fpm, VectorizerStartCallbacks(; opt_level)) + end add!(fpm, LoopUnrollPass(; opt_level)) + if LLVM.version() >= v"21" + add!(fpm, VectorizerEndCallbacks(; opt_level)) + end + if LLVM.version() >= v"16" + add!(fpm, SROAPass(; preserve_cfg=true)) + else + add!(fpm, SROAPass()) + end + add!(fpm, InstSimplifyPass()) end function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) @@ -312,13 +387,15 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) add!(fpm, DivRemPairsPass()) end end - # TODO invokeOptimizerLastCallbacks + if LLVM.version() >= v"17" + add!(mpm, OptimizerLastCallbacks(; opt_level)) + end add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, AnnotationRemarksPass()) end add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, DemoteFloat16Pass()) - if opt_level >= 1 + if opt_level >= 2 add!(fpm, GVNPass()) end end diff --git a/src/ptx.jl b/src/ptx.jl index 4a977371..a3af659d 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -130,6 +130,18 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) = function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) + # tell NVVMReflect whether to flush denormals; this mirrors what Clang does + # for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key + # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. only emit it on the + # toplevel module that runs through `optimize!`, as sub-modules (the cached + # runtime, deferred jobs) don't need it, and the cached runtime in + # particular would otherwise conflict on link if it was built with a + # different `fastmath` setting (which isn't part of `runtime_slug`). + if job.config.toplevel + flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] = + Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0))) + end + # emit the device capability and ptx isa version as constants in the module. this makes # it possible to 'query' these in device code, relying on LLVM to optimize the checks # away and generate static code. note that we only do so if there's actual uses of these @@ -261,12 +273,17 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), tm = llvm_machine(job.config.target) # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450) @dispose pb=NewPMPassBuilder() begin - register!(pb, NVVMReflectPass()) register!(pb, PTXRSqrtFastPass()) register!(pb, PTXFDivFastPass()) register!(pb, PTXFSqrtFastPass()) - - add!(pb, NVVMReflectPass()) + if LLVM.version() < v"17" + # Pre-17 LLVM has no way to invoke EP callbacks from the string + # API, so fall back to our own nvvm_reflect! implementation. + # LLVM 17+ picks up NVPTX's built-in NVVMReflectPass through the + # PipelineStart EP invocations woven into `buildNewPMPipeline!`. + register!(pb, NVVMReflectPass()) + add!(pb, NVVMReflectPass()) + end add!(pb, PTXRSqrtFastPass()) add!(pb, PTXFDivFastPass()) add!(pb, PTXFSqrtFastPass()) @@ -455,9 +472,12 @@ end # Replace occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect with an integer. # -# NOTE: this is the same as LLVM's NVVMReflect pass, which we cannot use because it is -# not exported. It is meant to be added to a pass pipeline automatically, by -# calling adjustPassManager, but we don't use a PassManagerBuilder so cannot do so. +# This is a back-port of LLVM's NVVMReflectPass for LLVM < 17, where the +# built-in pass cannot be invoked via the string-API PipelineStart EP callback. +# Semantics match LLVM's: `__CUDA_ARCH` is derived from the target capability, +# `__CUDA_FTZ` is read from the `nvvm-reflect-ftz` module flag, and every other +# key folds to 0. Knobs like denormal flushing or FMAD contraction must be +# configured through module flags or LLVM fast-math flags, not here. const NVVM_REFLECT_FUNCTION = "__nvvm_reflect" function nvvm_reflect!(mod::LLVM.Module) job = current_job::CompilerJob @@ -472,6 +492,18 @@ function nvvm_reflect!(mod::LLVM.Module) reflect_typ = return_type(function_type(reflect_function)) isa(reflect_typ, LLVM.IntegerType) || error("_reflect's return type should be integer") + # pull __CUDA_FTZ from the nvvm-reflect-ftz module flag (same source LLVM uses) + ftz_val = 0 + if haskey(flags(mod), "nvvm-reflect-ftz") + flag = flags(mod)["nvvm-reflect-ftz"] + if flag isa LLVM.ConstantAsMetadata + c = LLVM.Value(flag) + if c isa ConstantInt + ftz_val = Int(convert(Int64, c)) + end + end + end + to_remove = [] for use in uses(reflect_function) call = user(use) @@ -515,31 +547,14 @@ function nvvm_reflect!(mod::LLVM.Module) chars = convert.(Ref(UInt8), collect(sym_op)) reflect_arg = String(chars[1:end-1]) - # handle possible cases - # XXX: put some of these property in the compiler job? - # and/or first set the "nvvm-reflect-*" module flag like Clang does? - fast_math = current_job.config.target.fastmath - # NOTE: we follow nvcc's --use_fast_math - reflect_val = if reflect_arg == "__CUDA_FTZ" - # single-precision denormals support - ConstantInt(reflect_typ, fast_math ? 1 : 0) - elseif reflect_arg == "__CUDA_PREC_DIV" - # single-precision floating-point division and reciprocals. - ConstantInt(reflect_typ, fast_math ? 0 : 1) - elseif reflect_arg == "__CUDA_PREC_SQRT" - # single-precision floating point square roots. - ConstantInt(reflect_typ, fast_math ? 0 : 1) - elseif reflect_arg == "__CUDA_FMAD" - # contraction of floating-point multiplies and adds/subtracts into - # floating-point multiply-add operations (FMAD, FFMA, or DFMA) - ConstantInt(reflect_typ, fast_math ? 1 : 0) - elseif reflect_arg == "__CUDA_ARCH" - ConstantInt(reflect_typ, job.config.target.cap.major*100 + job.config.target.cap.minor*10) + # match LLVM's NVVMReflectPass: unknown keys fold to 0. + reflect_val = if reflect_arg == "__CUDA_ARCH" + ConstantInt(reflect_typ, + job.config.target.cap.major*100 + job.config.target.cap.minor*10) + elseif reflect_arg == "__CUDA_FTZ" + ConstantInt(reflect_typ, ftz_val) else - @safe_error """Unrecognized format of __nvvm_reflect call: - $(string(call)) - Unknown argument $reflect_arg. Please file an issue.""" - continue + ConstantInt(reflect_typ, 0) end replace_uses!(call, reflect_val)