From 19dac6e16fa1d26087123de5856ea9f4df03e279 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 23 Apr 2026 15:52:07 +0200
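Subject: [PATCH 1/6] Invoke NVVMReflect through callbacks.

Replace the `TODO invoke*Callbacks` placeholders in the optimization
pipeline with the corresponding extension-point callback passes
(PipelineStart, PipelineEarlySimplification, OptimizerEarly,
CGSCCOptimizerLate, Peephole, LateLoopOptimizations, LoopOptimizerEnd,
ScalarOptimizerLate, VectorizerStart, OptimizerLast) on LLVM 17+, and add
VectorizerEnd on LLVM 21+. With these in place, the PTX back-end only
registers and runs its own NVVMReflectPass on LLVM < 17.
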
---
 src/optim.jl | 46 ++++++++++++++++++++++++++++++++++++----------
 src/ptx.jl   | 11 ++++++++---
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/src/optim.jl b/src/optim.jl
index 8ecf4dc0..f09723c5 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -108,7 +108,9 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob),
         add!(mpm, VerifierPass())
     end
     add!(mpm, ForceFunctionAttrsPass())
-    # TODO invokePipelineStartCallbacks
+    if LLVM.version() >= v"17"
+        add!(mpm, PipelineStartCallbacks(; opt_level))
+    end
     add!(mpm, Annotation2MetadataPass())
     add!(mpm, ConstantMergePass())
     add!(mpm, NewPMFunctionPassManager()) do fpm
@@ -122,12 +124,19 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob),
             add!(fpm, SROAPass())
         end
     end
-    # TODO invokeEarlySimplificationCallbacks
+    if LLVM.version() >= v"17"
+        add!(mpm, PipelineEarlySimplificationCallbacks(; opt_level))
+    end
 end
 
 function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
+    if LLVM.version() >= v"17"
+        add!(mpm, OptimizerEarlyCallbacks(; opt_level))
+    end
     add!(mpm, NewPMCGSCCPassManager()) do cgpm
-        # TODO invokeCGSCCCallbacks
+        if LLVM.version() >= v"17"
+            add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level))
+        end
         add!(cgpm, NewPMFunctionPassManager()) do fpm
             add!(fpm, AllocOptPass())
             add!(fpm, Float2IntPass())
@@ -149,8 +158,10 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
                 add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
             end
+            if LLVM.version() >= v"17"
+                add!(fpm, PeepholeCallbacks(; opt_level))
+            end
         end
-        # TODO invokePeepholeCallbacks
     end
 end
 
@@ -160,7 +171,9 @@ function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_le
         if opt_level >= 2
             add!(lpm, LoopRotatePass())
         end
-        # TODO invokeLateLoopOptimizationCallbacks
+        if LLVM.version() >= v"17"
+            add!(lpm, LateLoopOptimizationsCallbacks(; opt_level))
+        end
     end
     if opt_level >= 2
         add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
@@ -182,7 +195,9 @@ function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_le
             add!(lpm, LoopDeletionPass())
             add!(lpm, LoopFullUnrollPass())
         end
-        # TODO invokeLoopOptimizerEndCallbacks
+        if LLVM.version() >= v"17"
+            add!(lpm, LoopOptimizerEndCallbacks(; opt_level))
+        end
     end
 end
 
@@ -205,7 +220,9 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
     end
     if opt_level >= 2
         add!(fpm, DSEPass())
-        # TODO invokePeepholeCallbacks
+        if LLVM.version() >= v"17"
+            add!(fpm, PeepholeCallbacks(; opt_level))
+        end
         add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         add!(fpm, AllocOptPass())
         add!(fpm, NewPMLoopPassManager()) do lpm
@@ -214,7 +231,9 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
         end
         add!(fpm, LoopDistributePass())
     end
-    # TODO invokeScalarOptimizerCallbacks
+    if LLVM.version() >= v"17"
+        add!(fpm, ScalarOptimizerLateCallbacks(; opt_level))
+    end
 end
 
 function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
@@ -225,9 +244,14 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
-    # TODO invokeVectorizerCallbacks
+    if LLVM.version() >= v"17"
+        add!(fpm, VectorizerStartCallbacks(; opt_level))
+    end
     add!(fpm, ADCEPass())
     add!(fpm, LoopUnrollPass(; opt_level))
+    if LLVM.version() >= v"21"
+        add!(fpm, VectorizerEndCallbacks(; opt_level))
+    end
 end
 
 function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
@@ -312,7 +336,9 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
             add!(fpm, DivRemPairsPass())
         end
     end
-    # TODO invokeOptimizerLastCallbacks
+    if LLVM.version() >= v"17"
+        add!(mpm, OptimizerLastCallbacks(; opt_level))
+    end
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, AnnotationRemarksPass())
     end
diff --git a/src/ptx.jl b/src/ptx.jl
index 4a977371..949753a3 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -261,12 +261,17 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
     tm = llvm_machine(job.config.target)
     # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450)
     @dispose pb=NewPMPassBuilder() begin
-        register!(pb, NVVMReflectPass())
         register!(pb, PTXRSqrtFastPass())
         register!(pb, PTXFDivFastPass())
         register!(pb, PTXFSqrtFastPass())
-
-        add!(pb, NVVMReflectPass())
+        if LLVM.version() < v"17"
+            # Pre-17 LLVM has no way to invoke EP callbacks from the string
+            # API, so fall back to our own nvvm_reflect! implementation.
+            # LLVM 17+ picks up NVPTX's built-in NVVMReflectPass through the
+            # PipelineStart EP invocations woven into `buildNewPMPipeline!`.
+            register!(pb, NVVMReflectPass())
+            add!(pb, NVVMReflectPass())
+        end
         add!(pb, PTXRSqrtFastPass())
         add!(pb, PTXFDivFastPass())
         add!(pb, PTXFSqrtFastPass())

From c095147832b9295f2a5737685a99b4ff12aef687 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 23 Apr 2026 16:09:39 +0200
Subject: [PATCH 2/6] Align optimization pipeline with Julia's.

Backfills a set of passes and reorderings from Julia's `pipeline.cpp`
that were missing from `buildNewPMPipeline!`:

- Early simplification: add InferFunctionAttrs, move DCE before
  SimplifyCFG, add EarlyCSE after SROA, and the GlobalOpt + Promote +
  InstCombine tail at O>=1.
- Early optimizer: gate the CGSCC AllocOpt/Float2Int/LowerConstantIntrinsics
  adaptor on O>=2, reorder the function-level sequence, add
  AggressiveInstCombine + ConstraintElimination, switch EarlyCSE to MSSA,
  and close with GlobalOpt + GlobalDCE.
- Loop optimizer: merge the first two LPMs into a single MSSA-enabled LPM
  with LoopInstSimplify + LoopSimplifyCFG + pre-rotate LICM (no
  speculation) + LoopRotate + LICM + SimpleLoopUnswitch, then run
  SimplifyCFG + InstCombine between that LPM and the remaining one; in
  the remaining LPM, swap LoopInstSimplify for SimpleLoopUnswitch.
- Scalar optimizer: add MergedLoadStoreMotion, BDCE, ADCE,
  ConstraintElimination and an early VectorCombine at O>=2; add an O>=1
  path mirroring Julia's lighter sequence; replace the
  LoopDeletion/LoopInstSimplify + LoopDistribute tail with an
  LICM/JuliaLICM LPM followed by SimplifyCFG + InstCombine.
- Vector pipeline: prepend a LoopRotate + LoopDeletion LPM, run
  LoopDistribute here instead of in scalar opts, and add the post-vectorize
  LICM + EarlyCSE + CVP + InstCombine cleanup plus the
  SROA(PreserveCFG) + InstSimplify tail.
---
 src/optim.jl | 102 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 74 insertions(+), 28 deletions(-)

diff --git a/src/optim.jl b/src/optim.jl
index f09723c5..b30e8bb0 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -112,16 +112,27 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob),
         add!(mpm, PipelineStartCallbacks(; opt_level))
     end
     add!(mpm, Annotation2MetadataPass())
+    add!(mpm, InferFunctionAttrsPass())
     add!(mpm, ConstantMergePass())
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, LowerExpectIntrinsicPass())
         if opt_level >= 2
             add!(fpm, PropagateJuliaAddrspacesPass())
         end
+        # DCE must come before simplifycfg: codegen can generate unused
+        # statements that would otherwise alter how simplifycfg optimizes the CFG.
+        add!(fpm, DCEPass())
         add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
         if opt_level >= 1
-            add!(fpm, DCEPass())
             add!(fpm, SROAPass())
+            add!(fpm, EarlyCSEPass())
+        end
+    end
+    if opt_level >= 1
+        add!(mpm, GlobalOptPass())
+        add!(mpm, NewPMFunctionPassManager()) do fpm
+            add!(fpm, PromotePass())
+            add!(fpm, instcombine_pass(job))
         end
     end
     if LLVM.version() >= v"17"
@@ -137,10 +148,12 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         if LLVM.version() >= v"17"
             add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level))
         end
-        add!(cgpm, NewPMFunctionPassManager()) do fpm
-            add!(fpm, AllocOptPass())
-            add!(fpm, Float2IntPass())
-            add!(fpm, LowerConstantIntrinsicsPass())
+        if opt_level >= 2
+            add!(cgpm, NewPMFunctionPassManager()) do fpm
+                add!(fpm, AllocOptPass())
+                add!(fpm, Float2IntPass())
+                add!(fpm, LowerConstantIntrinsicsPass())
+            end
         end
     end
     add!(mpm, GPULowerCPUFeaturesPass())
@@ -148,50 +161,56 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         add!(mpm, NewPMFunctionPassManager()) do fpm
             if opt_level >= 2
                 add!(fpm, SROAPass())
+                add!(fpm, EarlyCSEPass(; memssa=true))
                 add!(fpm, instcombine_pass(job))
+                add!(fpm, AggressiveInstCombinePass())
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
                 add!(fpm, ReassociatePass())
-                add!(fpm, EarlyCSEPass())
+                add!(fpm, ConstraintEliminationPass())
                 add!(fpm, AllocOptPass())
             else
-                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
+                add!(fpm, instcombine_pass(job))
             end
             if LLVM.version() >= v"17"
                 add!(fpm, PeepholeCallbacks(; opt_level))
             end
         end
     end
+    add!(mpm, GlobalOptPass())
+    add!(mpm, GlobalDCEPass())
 end
 
 function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
-    add!(fpm, NewPMLoopPassManager()) do lpm
+    add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
         add!(lpm, LowerSIMDLoopPass())
         if opt_level >= 2
+            add!(lpm, LoopInstSimplifyPass())
+            add!(lpm, LoopSimplifyCFGPass())
+            # run LICM with AllowSpeculation=false before rotation to avoid
+            # speculating loads that rotation can hoist more precisely.
+            add!(lpm, LICMPass(; allowspeculation=false))
+            add!(lpm, JuliaLICMPass())
             add!(lpm, LoopRotatePass())
-        end
-        if LLVM.version() >= v"17"
-            add!(lpm, LateLoopOptimizationsCallbacks(; opt_level))
-        end
-    end
-    if opt_level >= 2
-        add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
             add!(lpm, LICMPass())
             add!(lpm, JuliaLICMPass())
             add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
-            add!(lpm, LICMPass())
-            add!(lpm, JuliaLICMPass())
+        end
+        if LLVM.version() >= v"17"
+            add!(lpm, LateLoopOptimizationsCallbacks(; opt_level))
         end
     end
     if opt_level >= 2
         add!(fpm, IRCEPass())
     end
+    add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
+    add!(fpm, instcombine_pass(job))
     add!(fpm, NewPMLoopPassManager()) do lpm
         if opt_level >= 2
-            add!(lpm, LoopInstSimplifyPass())
             add!(lpm, LoopIdiomRecognizePass())
             add!(lpm, IndVarSimplifyPass())
+            add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
             add!(lpm, LoopDeletionPass())
             add!(lpm, LoopFullUnrollPass())
         end
@@ -205,15 +224,27 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
     if opt_level >= 2
         add!(fpm, AllocOptPass())
         add!(fpm, SROAPass())
-        add!(fpm, InstSimplifyPass())
+        add!(fpm, VectorCombinePass())
+        add!(fpm, MergedLoadStoreMotionPass())
         add!(fpm, GVNPass())
-        add!(fpm, MemCpyOptPass())
         add!(fpm, SCCPPass())
+        add!(fpm, BDCEPass())
+        add!(fpm, instcombine_pass(job))
         add!(fpm, CorrelatedValuePropagationPass())
-        add!(fpm, DCEPass())
+        add!(fpm, ADCEPass())
+        add!(fpm, MemCpyOptPass())
+        add!(fpm, DSEPass())
         add!(fpm, IRCEPass())
-        add!(fpm, instcombine_pass(job))
         add!(fpm, JumpThreadingPass())
+        add!(fpm, ConstraintEliminationPass())
+    elseif opt_level >= 1
+        add!(fpm, AllocOptPass())
+        add!(fpm, SROAPass())
+        add!(fpm, MemCpyOptPass())
+        add!(fpm, SCCPPass())
+        add!(fpm, BDCEPass())
+        add!(fpm, instcombine_pass(job))
+        add!(fpm, ADCEPass())
     end
     if opt_level >= 3
         add!(fpm, GVNPass())
@@ -225,11 +256,14 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
         end
         add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         add!(fpm, AllocOptPass())
-        add!(fpm, NewPMLoopPassManager()) do lpm
-            add!(lpm, LoopDeletionPass())
-            add!(lpm, LoopInstSimplifyPass())
+        add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
+            add!(lpm, LICMPass())
+            add!(lpm, JuliaLICMPass())
         end
-        add!(fpm, LoopDistributePass())
+        add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
+        add!(fpm, instcombine_pass(job))
+    elseif opt_level >= 1
+        add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
     end
     if LLVM.version() >= v"17"
         add!(fpm, ScalarOptimizerLateCallbacks(; opt_level))
@@ -237,21 +271,33 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
 end
 
 function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
+    # re-rotate loops that might have been unrotated in the simplification above
+    add!(fpm, NewPMLoopPassManager()) do lpm
+        add!(lpm, LoopRotatePass())
+        add!(lpm, LoopDeletionPass())
+    end
+    add!(fpm, LoopDistributePass())
     add!(fpm, InjectTLIMappings())
     add!(fpm, LoopVectorizePass())
     add!(fpm, LoopLoadEliminationPass())
-    add!(fpm, instcombine_pass(job))
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
+    add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
+        add!(lpm, LICMPass())
+    end
+    add!(fpm, EarlyCSEPass())
+    add!(fpm, CorrelatedValuePropagationPass())
+    add!(fpm, instcombine_pass(job))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
     if LLVM.version() >= v"17"
         add!(fpm, VectorizerStartCallbacks(; opt_level))
     end
-    add!(fpm, ADCEPass())
     add!(fpm, LoopUnrollPass(; opt_level))
     if LLVM.version() >= v"21"
         add!(fpm, VectorizerEndCallbacks(; opt_level))
     end
+    add!(fpm, SROAPass(; preserve_cfg=true))
+    add!(fpm, InstSimplifyPass())
 end
 
 function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)

From fe620408dcdd0493a9fee000fcbfd074efc26522 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 23 Apr 2026 16:15:59 +0200
Subject: [PATCH 3/6] Close remaining gaps versus Julia's pipeline.

- Add LibCallsShrinkWrapPass in the early-optimizer O>=2 function
  sequence (between CorrelatedValuePropagation and Reassociate),
  matching Julia. A no-op on modules without libc-shaped calls, but
  beneficial on libdevice-using PTX code.
- Gate the cleanup-pipeline GVN tail on opt_level >= 2 to match Julia
  (was opt_level >= 1).
---
 src/optim.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/optim.jl b/src/optim.jl
index b30e8bb0..c6cb405c 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -166,6 +166,7 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
                 add!(fpm, AggressiveInstCombinePass())
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
+                add!(fpm, LibCallsShrinkWrapPass())
                 add!(fpm, ReassociatePass())
                 add!(fpm, ConstraintEliminationPass())
                 add!(fpm, AllocOptPass())
@@ -390,7 +391,7 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
     end
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, DemoteFloat16Pass())
-        if opt_level >= 1
+        if opt_level >= 2
             add!(fpm, GVNPass())
         end
     end

From d1fe8d68302b832b3aa736ea0c3ab8a0f5a82be1 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 23 Apr 2026 16:21:18 +0200
Subject: [PATCH 4/6] Adopt LLVM's NVVMReflect semantics.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set the `nvvm-reflect-ftz` module flag from `target.fastmath` in
`finish_module!`. This is the same channel Clang uses for
`-fcuda-flush-denormals-to-zero`; it determines `__CUDA_FTZ`, the only
`__nvvm_reflect` key that LLVM's upstream NVVMReflectPass honors besides
`__CUDA_ARCH`. On LLVM 17+, that pass now runs via the PipelineStart EP
callback we wire up in `buildEarlySimplificationPipeline`.

Simplify the custom `nvvm_reflect!` fallback (LLVM < 17) to match
upstream semantics: `__CUDA_ARCH` from the target capability, `__CUDA_FTZ`
from the module flag, every other key folds to 0. The previous
fastmath-derived handling of `__CUDA_PREC_DIV` / `__CUDA_PREC_SQRT` /
`__CUDA_FMAD` is dropped; callers that want those behaviors should rely
on LLVM fast-math flags on the FP ops themselves.
---
 src/ptx.jl | 58 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/ptx.jl b/src/ptx.jl
index 949753a3..4f583d6d 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -130,6 +130,12 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
 
 function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
                         mod::LLVM.Module, entry::LLVM.Function)
+    # tell NVVMReflect whether to flush denormals; this mirrors what Clang does
+    # for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key
+    # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`.
+    flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] =
+        Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0)))
+
     # emit the device capability and ptx isa version as constants in the module. this makes
     # it possible to 'query' these in device code, relying on LLVM to optimize the checks
     # away and generate static code. note that we only do so if there's actual uses of these
@@ -460,9 +466,12 @@ end
 
 # Replace occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect with an integer.
 #
-# NOTE: this is the same as LLVM's NVVMReflect pass, which we cannot use because it is
-#       not exported. It is meant to be added to a pass pipeline automatically, by
-#       calling adjustPassManager, but we don't use a PassManagerBuilder so cannot do so.
+# This is a back-port of LLVM's NVVMReflectPass for LLVM < 17, where the
+# built-in pass cannot be invoked via the string-API PipelineStart EP callback.
+# Semantics match LLVM's: `__CUDA_ARCH` is derived from the target capability,
+# `__CUDA_FTZ` is read from the `nvvm-reflect-ftz` module flag, and every other
+# key folds to 0. Knobs like denormal flushing or FMAD contraction must be
+# configured through module flags or LLVM fast-math flags, not here.
 const NVVM_REFLECT_FUNCTION = "__nvvm_reflect"
 function nvvm_reflect!(mod::LLVM.Module)
     job = current_job::CompilerJob
@@ -477,6 +486,18 @@ function nvvm_reflect!(mod::LLVM.Module)
     reflect_typ = return_type(function_type(reflect_function))
     isa(reflect_typ, LLVM.IntegerType) || error("_reflect's return type should be integer")
 
+    # pull __CUDA_FTZ from the nvvm-reflect-ftz module flag (same source LLVM uses)
+    ftz_val = 0
+    if haskey(flags(mod), "nvvm-reflect-ftz")
+        flag = flags(mod)["nvvm-reflect-ftz"]
+        if flag isa LLVM.ConstantAsMetadata
+            c = LLVM.Value(flag)
+            if c isa ConstantInt
+                ftz_val = Int(convert(Int64, c))
+            end
+        end
+    end
+
     to_remove = []
     for use in uses(reflect_function)
         call = user(use)
@@ -520,31 +541,14 @@ function nvvm_reflect!(mod::LLVM.Module)
         chars = convert.(Ref(UInt8), collect(sym_op))
         reflect_arg = String(chars[1:end-1])
 
-        # handle possible cases
-        # XXX: put some of these property in the compiler job?
-        #      and/or first set the "nvvm-reflect-*" module flag like Clang does?
-        fast_math = current_job.config.target.fastmath
-        # NOTE: we follow nvcc's --use_fast_math
-        reflect_val = if reflect_arg == "__CUDA_FTZ"
-            # single-precision denormals support
-            ConstantInt(reflect_typ, fast_math ? 1 : 0)
-        elseif reflect_arg == "__CUDA_PREC_DIV"
-            # single-precision floating-point division and reciprocals.
-            ConstantInt(reflect_typ, fast_math ? 0 : 1)
-        elseif reflect_arg == "__CUDA_PREC_SQRT"
-            # single-precision floating point square roots.
-            ConstantInt(reflect_typ, fast_math ? 0 : 1)
-        elseif reflect_arg == "__CUDA_FMAD"
-            # contraction of floating-point multiplies and adds/subtracts into
-            # floating-point multiply-add operations (FMAD, FFMA, or DFMA)
-            ConstantInt(reflect_typ, fast_math ? 1 : 0)
-        elseif reflect_arg == "__CUDA_ARCH"
-            ConstantInt(reflect_typ, job.config.target.cap.major*100 + job.config.target.cap.minor*10)
+        # match LLVM's NVVMReflectPass: unknown keys fold to 0.
+        reflect_val = if reflect_arg == "__CUDA_ARCH"
+            ConstantInt(reflect_typ,
+                        job.config.target.cap.major*100 + job.config.target.cap.minor*10)
+        elseif reflect_arg == "__CUDA_FTZ"
+            ConstantInt(reflect_typ, ftz_val)
         else
-            @safe_error """Unrecognized format of __nvvm_reflect call:
-                           $(string(call))
-                           Unknown argument $reflect_arg. Please file an issue."""
-            continue
+            ConstantInt(reflect_typ, 0)
         end
 
         replace_uses!(call, reflect_val)

From 4382dc7b07d63a4efb7481a450506b68adc2c8e7 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Fri, 24 Apr 2026 10:32:44 +0200
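Subject: [PATCH 5/6] Fix for Julia 1.10.

Only pass `preserve_cfg=true` to `SROAPass` in the vector pipeline on
LLVM 16 and newer; on older LLVM versions (as used by Julia 1.10), fall
back to a plain `SROAPass()`.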
---
 src/optim.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/optim.jl b/src/optim.jl
index c6cb405c..0bf2d4cd 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -297,7 +297,11 @@ function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
     if LLVM.version() >= v"21"
         add!(fpm, VectorizerEndCallbacks(; opt_level))
     end
-    add!(fpm, SROAPass(; preserve_cfg=true))
+    if LLVM.version() >= v"16"
+        add!(fpm, SROAPass(; preserve_cfg=true))
+    else
+        add!(fpm, SROAPass())
+    end
     add!(fpm, InstSimplifyPass())
 end
 

From b9ceb1881e2524e0b8c241ae6596ea5d3bb50970 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 19 May 2026 17:28:21 +0200
Subject: [PATCH 6/6] Only emit the FTZ module flag for toplevel modules.

---
 src/ptx.jl | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/ptx.jl b/src/ptx.jl
index 4f583d6d..a3af659d 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -132,9 +132,15 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
                         mod::LLVM.Module, entry::LLVM.Function)
     # tell NVVMReflect whether to flush denormals; this mirrors what Clang does
     # for `-fcuda-flush-denormals-to-zero` and is the only `__nvvm_reflect` key
-    # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`.
-    flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] =
-        Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0)))
+    # LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. only emit it on the
+    # toplevel module that runs through `optimize!`, as sub-modules (the cached
+    # runtime, deferred jobs) don't need it, and the cached runtime in
+    # particular would otherwise conflict on link if it was built with a
+    # different `fastmath` setting (which isn't part of `runtime_slug`).
+    if job.config.toplevel
+        flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] =
+            Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0)))
+    end
 
     # emit the device capability and ptx isa version as constants in the module. this makes
     # it possible to 'query' these in device code, relying on LLVM to optimize the checks