From 19dac6e16fa1d26087123de5856ea9f4df03e279 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 23 Apr 2026 15:52:07 +0200 Subject: [PATCH 1/6] Invoke NVVMReflect through callbacks. --- src/optim.jl | 46 ++++++++++++++++++++++++++++++++++++---------- src/ptx.jl | 11 ++++++++--- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/optim.jl b/src/optim.jl index 8ecf4dc0..f09723c5 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -108,7 +108,9 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob), add!(mpm, VerifierPass()) end add!(mpm, ForceFunctionAttrsPass()) - # TODO invokePipelineStartCallbacks + if LLVM.version() >= v"17" + add!(mpm, PipelineStartCallbacks(; opt_level)) + end add!(mpm, Annotation2MetadataPass()) add!(mpm, ConstantMergePass()) add!(mpm, NewPMFunctionPassManager()) do fpm @@ -122,12 +124,19 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob), add!(fpm, SROAPass()) end end - # TODO invokeEarlySimplificationCallbacks + if LLVM.version() >= v"17" + add!(mpm, PipelineEarlySimplificationCallbacks(; opt_level)) + end end function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + if LLVM.version() >= v"17" + add!(mpm, OptimizerEarlyCallbacks(; opt_level)) + end add!(mpm, NewPMCGSCCPassManager()) do cgpm - # TODO invokeCGSCCCallbacks + if LLVM.version() >= v"17" + add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level)) + end add!(cgpm, NewPMFunctionPassManager()) do fpm add!(fpm, AllocOptPass()) add!(fpm, Float2IntPass()) @@ -149,8 +158,10 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) end + if LLVM.version() >= v"17" + add!(fpm, PeepholeCallbacks(; opt_level)) + end end - # TODO invokePeepholeCallbacks end end @@ -160,7 +171,9 @@ function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_le if opt_level >= 2 add!(lpm, LoopRotatePass()) end - # 
TODO invokeLateLoopOptimizationCallbacks + if LLVM.version() >= v"17" + add!(lpm, LateLoopOptimizationsCallbacks(; opt_level)) + end end if opt_level >= 2 add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm @@ -182,7 +195,9 @@ function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_le add!(lpm, LoopDeletionPass()) add!(lpm, LoopFullUnrollPass()) end - # TODO invokeLoopOptimizerEndCallbacks + if LLVM.version() >= v"17" + add!(lpm, LoopOptimizerEndCallbacks(; opt_level)) + end end end @@ -205,7 +220,9 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ end if opt_level >= 2 add!(fpm, DSEPass()) - # TODO invokePeepholeCallbacks + if LLVM.version() >= v"17" + add!(fpm, PeepholeCallbacks(; opt_level)) + end add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) add!(fpm, AllocOptPass()) add!(fpm, NewPMLoopPassManager()) do lpm @@ -214,7 +231,9 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ end add!(fpm, LoopDistributePass()) end - # TODO invokeScalarOptimizerCallbacks + if LLVM.version() >= v"17" + add!(fpm, ScalarOptimizerLateCallbacks(; opt_level)) + end end function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) @@ -225,9 +244,14 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) add!(fpm, SLPVectorizerPass()) add!(fpm, VectorCombinePass()) - # TODO invokeVectorizerCallbacks + if LLVM.version() >= v"17" + add!(fpm, VectorizerStartCallbacks(; opt_level)) + end add!(fpm, ADCEPass()) add!(fpm, LoopUnrollPass(; opt_level)) + if LLVM.version() >= v"21" + add!(fpm, VectorizerEndCallbacks(; opt_level)) + end end function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) @@ -312,7 +336,9 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) add!(fpm, DivRemPairsPass()) end end - # TODO 
invokeOptimizerLastCallbacks + if LLVM.version() >= v"17" + add!(mpm, OptimizerLastCallbacks(; opt_level)) + end add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, AnnotationRemarksPass()) end diff --git a/src/ptx.jl b/src/ptx.jl index 4a977371..949753a3 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -261,12 +261,17 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), tm = llvm_machine(job.config.target) # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450) @dispose pb=NewPMPassBuilder() begin - register!(pb, NVVMReflectPass()) register!(pb, PTXRSqrtFastPass()) register!(pb, PTXFDivFastPass()) register!(pb, PTXFSqrtFastPass()) - - add!(pb, NVVMReflectPass()) + if LLVM.version() < v"17" + # Pre-17 LLVM has no way to invoke EP callbacks from the string + # API, so fall back to our own nvvm_reflect! implementation. + # LLVM 17+ picks up NVPTX's built-in NVVMReflectPass through the + # PipelineStart EP invocations woven into `buildNewPMPipeline!`. + register!(pb, NVVMReflectPass()) + add!(pb, NVVMReflectPass()) + end add!(pb, PTXRSqrtFastPass()) add!(pb, PTXFDivFastPass()) add!(pb, PTXFSqrtFastPass()) From c095147832b9295f2a5737685a99b4ff12aef687 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 23 Apr 2026 16:09:39 +0200 Subject: [PATCH 2/6] Align optimization pipeline with Julia's. Backfills a set of passes and reorderings from Julia's `pipeline.cpp` that were missing from `buildNewPMPipeline!`: - Early simplification: add InferFunctionAttrs, move DCE before SimplifyCFG, add EarlyCSE after SROA, and the GlobalOpt + Promote + InstCombine tail at O>=1. - Early optimizer: gate the CGSCC AllocOpt/Float2Int/LowerConstantIntrinsics adaptor on O>=2, reorder the function-level sequence, add AggressiveInstCombine + ConstraintElimination, switch EarlyCSE to MSSA, and close with GlobalOpt + GlobalDCE. 
- Loop optimizer: merge into a single MSSA-enabled LPM with LoopInstSimplify + LoopSimplifyCFG + pre-rotate LICM (no speculation) + LoopRotate + LICM + SimpleLoopUnswitch, then run SimplifyCFG + InstCombine between the two loop sub-pipelines. - Scalar optimizer: add MergedLoadStoreMotion, BDCE, ADCE, ConstraintElimination and an early VectorCombine at O>=2; add an O>=1 path mirroring Julia's lighter sequence; replace the LoopDeletion/LoopInstSimplify + LoopDistribute tail with an LICM/JuliaLICM LPM followed by SimplifyCFG + InstCombine. - Vector pipeline: prepend a LoopRotate + LoopDeletion LPM, run LoopDistribute here instead of in scalar opts, and add the post-vectorize LICM + EarlyCSE + CVP + InstCombine cleanup plus the SROA(PreserveCFG) + InstSimplify tail. --- src/optim.jl | 102 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 28 deletions(-) diff --git a/src/optim.jl b/src/optim.jl index f09723c5..b30e8bb0 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -112,16 +112,27 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob), add!(mpm, PipelineStartCallbacks(; opt_level)) end add!(mpm, Annotation2MetadataPass()) + add!(mpm, InferFunctionAttrsPass()) add!(mpm, ConstantMergePass()) add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, LowerExpectIntrinsicPass()) if opt_level >= 2 add!(fpm, PropagateJuliaAddrspacesPass()) end + # DCE must come before simplifycfg: codegen can generate unused + # statements that would otherwise alter how simplifycfg optimizes the CFG. 
+ add!(fpm, DCEPass()) add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...)) if opt_level >= 1 - add!(fpm, DCEPass()) add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) + end + end + if opt_level >= 1 + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PromotePass()) + add!(fpm, instcombine_pass(job)) end end if LLVM.version() >= v"17" @@ -137,10 +148,12 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l if LLVM.version() >= v"17" add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level)) end - add!(cgpm, NewPMFunctionPassManager()) do fpm - add!(fpm, AllocOptPass()) - add!(fpm, Float2IntPass()) - add!(fpm, LowerConstantIntrinsicsPass()) + if opt_level >= 2 + add!(cgpm, NewPMFunctionPassManager()) do fpm + add!(fpm, AllocOptPass()) + add!(fpm, Float2IntPass()) + add!(fpm, LowerConstantIntrinsicsPass()) + end end end add!(mpm, GPULowerCPUFeaturesPass()) @@ -148,50 +161,56 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l add!(mpm, NewPMFunctionPassManager()) do fpm if opt_level >= 2 add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass(; memssa=true)) add!(fpm, instcombine_pass(job)) + add!(fpm, AggressiveInstCombinePass()) add!(fpm, JumpThreadingPass()) add!(fpm, CorrelatedValuePropagationPass()) add!(fpm, ReassociatePass()) - add!(fpm, EarlyCSEPass()) + add!(fpm, ConstraintEliminationPass()) add!(fpm, AllocOptPass()) else - add!(fpm, instcombine_pass(job)) add!(fpm, EarlyCSEPass()) + add!(fpm, instcombine_pass(job)) end if LLVM.version() >= v"17" add!(fpm, PeepholeCallbacks(; opt_level)) end end end + add!(mpm, GlobalOptPass()) + add!(mpm, GlobalDCEPass()) end function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) - add!(fpm, NewPMLoopPassManager()) do lpm + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm add!(lpm, LowerSIMDLoopPass()) if opt_level >= 2 + add!(lpm, LoopInstSimplifyPass()) + add!(lpm, LoopSimplifyCFGPass()) + # run 
LICM with AllowSpeculation=false before rotation to avoid + # speculating loads that rotation can hoist more precisely. + add!(lpm, LICMPass(; allowspeculation=false)) + add!(lpm, JuliaLICMPass()) add!(lpm, LoopRotatePass()) - end - if LLVM.version() >= v"17" - add!(lpm, LateLoopOptimizationsCallbacks(; opt_level)) - end - end - if opt_level >= 2 - add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm add!(lpm, LICMPass()) add!(lpm, JuliaLICMPass()) add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true)) - add!(lpm, LICMPass()) - add!(lpm, JuliaLICMPass()) + end + if LLVM.version() >= v"17" + add!(lpm, LateLoopOptimizationsCallbacks(; opt_level)) end end if opt_level >= 2 add!(fpm, IRCEPass()) end + add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...)) + add!(fpm, instcombine_pass(job)) add!(fpm, NewPMLoopPassManager()) do lpm if opt_level >= 2 - add!(lpm, LoopInstSimplifyPass()) add!(lpm, LoopIdiomRecognizePass()) add!(lpm, IndVarSimplifyPass()) + add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true)) add!(lpm, LoopDeletionPass()) add!(lpm, LoopFullUnrollPass()) end @@ -205,15 +224,27 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ if opt_level >= 2 add!(fpm, AllocOptPass()) add!(fpm, SROAPass()) - add!(fpm, InstSimplifyPass()) + add!(fpm, VectorCombinePass()) + add!(fpm, MergedLoadStoreMotionPass()) add!(fpm, GVNPass()) - add!(fpm, MemCpyOptPass()) add!(fpm, SCCPPass()) + add!(fpm, BDCEPass()) + add!(fpm, instcombine_pass(job)) add!(fpm, CorrelatedValuePropagationPass()) - add!(fpm, DCEPass()) + add!(fpm, ADCEPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, DSEPass()) add!(fpm, IRCEPass()) - add!(fpm, instcombine_pass(job)) add!(fpm, JumpThreadingPass()) + add!(fpm, ConstraintEliminationPass()) + elseif opt_level >= 1 + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + add!(fpm, BDCEPass()) + add!(fpm, instcombine_pass(job)) + add!(fpm, 
ADCEPass()) end if opt_level >= 3 add!(fpm, GVNPass()) @@ -225,11 +256,14 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ end add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) add!(fpm, AllocOptPass()) - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, LoopDeletionPass()) - add!(lpm, LoopInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) end - add!(fpm, LoopDistributePass()) + add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) + add!(fpm, instcombine_pass(job)) + elseif opt_level >= 1 + add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) end if LLVM.version() >= v"17" add!(fpm, ScalarOptimizerLateCallbacks(; opt_level)) @@ -237,21 +271,33 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_ end function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) + # re-rotate loops that might have been unrotated in the simplification above + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopRotatePass()) + add!(lpm, LoopDeletionPass()) + end + add!(fpm, LoopDistributePass()) add!(fpm, InjectTLIMappings()) add!(fpm, LoopVectorizePass()) add!(fpm, LoopLoadEliminationPass()) - add!(fpm, instcombine_pass(job)) add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...)) + add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm + add!(lpm, LICMPass()) + end + add!(fpm, EarlyCSEPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, instcombine_pass(job)) add!(fpm, SLPVectorizerPass()) add!(fpm, VectorCombinePass()) if LLVM.version() >= v"17" add!(fpm, VectorizerStartCallbacks(; opt_level)) end - add!(fpm, ADCEPass()) add!(fpm, LoopUnrollPass(; opt_level)) if LLVM.version() >= v"21" add!(fpm, VectorizerEndCallbacks(; opt_level)) end + add!(fpm, SROAPass(; preserve_cfg=true)) + add!(fpm, InstSimplifyPass()) end function 
buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) From fe620408dcdd0493a9fee000fcbfd074efc26522 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 23 Apr 2026 16:15:59 +0200 Subject: [PATCH 3/6] Close remaining gaps versus Julia's pipeline. - Add LibCallsShrinkWrapPass in the early-optimizer O>=2 function sequence (between CorrelatedValuePropagation and Reassociate), matching Julia. A no-op on modules without libc-shaped calls, but beneficial on libdevice-using PTX code. - Gate the cleanup-pipeline GVN tail on opt_level >= 2 to match Julia (was opt_level >= 1). --- src/optim.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/optim.jl b/src/optim.jl index b30e8bb0..c6cb405c 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -166,6 +166,7 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l add!(fpm, AggressiveInstCombinePass()) add!(fpm, JumpThreadingPass()) add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, LibCallsShrinkWrapPass()) add!(fpm, ReassociatePass()) add!(fpm, ConstraintEliminationPass()) add!(fpm, AllocOptPass()) @@ -390,7 +391,7 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) end add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, DemoteFloat16Pass()) - if opt_level >= 1 + if opt_level >= 2 add!(fpm, GVNPass()) end end From d1fe8d68302b832b3aa736ea0c3ab8a0f5a82be1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 23 Apr 2026 16:21:18 +0200 Subject: [PATCH 4/6] Adopt LLVM's NVVMReflect semantics. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the `nvvm-reflect-ftz` module flag from `target.fastmath` in `finish_module!` — the same channel Clang uses for `-fcuda-flush-denormals-to-zero`, and the only `__nvvm_reflect` key that LLVM's upstream NVVMReflectPass honors besides `__CUDA_ARCH`. 
On LLVM 17+, that pass now runs via the PipelineStart EP callback we wire up in `buildEarlySimplificationPipeline`. Simplify the custom `nvvm_reflect!` fallback (LLVM < 17) to match upstream semantics: `__CUDA_ARCH` from the target capability, `__CUDA_FTZ` from the module flag, every other key folds to 0. The previous fastmath-derived handling of `__CUDA_PREC_DIV` / `__CUDA_PREC_SQRT` / `__CUDA_FMAD` is dropped; callers that want those behaviors should rely on LLVM fast-math flags on the FP ops themselves. --- src/ptx.jl | 58 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/ptx.jl b/src/ptx.jl index 949753a3..4f583d6d 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -130,6 +130,12 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) = function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) + # tell NVVMReflect whether to flush denormals; this mirrors what Clang does + # for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key + # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. + flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] = + Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0))) + # emit the device capability and ptx isa version as constants in the module. this makes # it possible to 'query' these in device code, relying on LLVM to optimize the checks # away and generate static code. note that we only do so if there's actual uses of these @@ -460,9 +466,12 @@ end # Replace occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect with an integer. # -# NOTE: this is the same as LLVM's NVVMReflect pass, which we cannot use because it is -# not exported. It is meant to be added to a pass pipeline automatically, by -# calling adjustPassManager, but we don't use a PassManagerBuilder so cannot do so. 
+# This is a back-port of LLVM's NVVMReflectPass for LLVM < 17, where the +# built-in pass cannot be invoked via the string-API PipelineStart EP callback. +# Semantics match LLVM's: `__CUDA_ARCH` is derived from the target capability, +# `__CUDA_FTZ` is read from the `nvvm-reflect-ftz` module flag, and every other +# key folds to 0. Knobs like denormal flushing or FMAD contraction must be +# configured through module flags or LLVM fast-math flags, not here. const NVVM_REFLECT_FUNCTION = "__nvvm_reflect" function nvvm_reflect!(mod::LLVM.Module) job = current_job::CompilerJob @@ -477,6 +486,18 @@ function nvvm_reflect!(mod::LLVM.Module) reflect_typ = return_type(function_type(reflect_function)) isa(reflect_typ, LLVM.IntegerType) || error("_reflect's return type should be integer") + # pull __CUDA_FTZ from the nvvm-reflect-ftz module flag (same source LLVM uses) + ftz_val = 0 + if haskey(flags(mod), "nvvm-reflect-ftz") + flag = flags(mod)["nvvm-reflect-ftz"] + if flag isa LLVM.ConstantAsMetadata + c = LLVM.Value(flag) + if c isa ConstantInt + ftz_val = Int(convert(Int64, c)) + end + end + end + to_remove = [] for use in uses(reflect_function) call = user(use) @@ -520,31 +541,14 @@ function nvvm_reflect!(mod::LLVM.Module) chars = convert.(Ref(UInt8), collect(sym_op)) reflect_arg = String(chars[1:end-1]) - # handle possible cases - # XXX: put some of these property in the compiler job? - # and/or first set the "nvvm-reflect-*" module flag like Clang does? - fast_math = current_job.config.target.fastmath - # NOTE: we follow nvcc's --use_fast_math - reflect_val = if reflect_arg == "__CUDA_FTZ" - # single-precision denormals support - ConstantInt(reflect_typ, fast_math ? 1 : 0) - elseif reflect_arg == "__CUDA_PREC_DIV" - # single-precision floating-point division and reciprocals. - ConstantInt(reflect_typ, fast_math ? 0 : 1) - elseif reflect_arg == "__CUDA_PREC_SQRT" - # single-precision floating point square roots. - ConstantInt(reflect_typ, fast_math ? 
0 : 1) - elseif reflect_arg == "__CUDA_FMAD" - # contraction of floating-point multiplies and adds/subtracts into - # floating-point multiply-add operations (FMAD, FFMA, or DFMA) - ConstantInt(reflect_typ, fast_math ? 1 : 0) - elseif reflect_arg == "__CUDA_ARCH" - ConstantInt(reflect_typ, job.config.target.cap.major*100 + job.config.target.cap.minor*10) + # match LLVM's NVVMReflectPass: unknown keys fold to 0. + reflect_val = if reflect_arg == "__CUDA_ARCH" + ConstantInt(reflect_typ, + job.config.target.cap.major*100 + job.config.target.cap.minor*10) + elseif reflect_arg == "__CUDA_FTZ" + ConstantInt(reflect_typ, ftz_val) else - @safe_error """Unrecognized format of __nvvm_reflect call: - $(string(call)) - Unknown argument $reflect_arg. Please file an issue.""" - continue + ConstantInt(reflect_typ, 0) end replace_uses!(call, reflect_val) From 4382dc7b07d63a4efb7481a450506b68adc2c8e7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 24 Apr 2026 10:32:44 +0200 Subject: [PATCH 5/6] Fix for Julia 1.10: only pass `preserve_cfg=true` to `SROAPass` on LLVM 16+. --- src/optim.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/optim.jl b/src/optim.jl index c6cb405c..0bf2d4cd 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -297,7 +297,11 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) if LLVM.version() >= v"21" add!(fpm, VectorizerEndCallbacks(; opt_level)) end - add!(fpm, SROAPass(; preserve_cfg=true)) + if LLVM.version() >= v"16" + add!(fpm, SROAPass(; preserve_cfg=true)) + else + add!(fpm, SROAPass()) + end add!(fpm, InstSimplifyPass()) end From b9ceb1881e2524e0b8c241ae6596ea5d3bb50970 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 May 2026 17:28:21 +0200 Subject: [PATCH 6/6] Only emit the FTZ module flag on toplevel modules.
--- src/ptx.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ptx.jl b/src/ptx.jl index 4f583d6d..a3af659d 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -132,9 +132,15 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) # tell NVVMReflect whether to flush denormals; this mirrors what Clang does # for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key - # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. - flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] = - Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0))) + # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. only emit it on the + # toplevel module that runs through `optimize!`, as sub-modules (the cached + # runtime, deferred jobs) don't need it, and the cached runtime in + # particular would otherwise conflict on link if it was built with a + # different `fastmath` setting (which isn't part of `runtime_slug`). + if job.config.toplevel + flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] = + Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0))) + end # emit the device capability and ptx isa version as constants in the module. this makes # it possible to 'query' these in device code, relying on LLVM to optimize the checks