diff --git a/src/optim.jl b/src/optim.jl
index 8ecf4dc0..0bf2d4cd 100644
--- a/src/optim.jl
+++ b/src/optim.jl
@@ -108,30 +108,52 @@ function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob),
         add!(mpm, VerifierPass())
     end
     add!(mpm, ForceFunctionAttrsPass())
-    # TODO invokePipelineStartCallbacks
+    if LLVM.version() >= v"17"
+        add!(mpm, PipelineStartCallbacks(; opt_level))
+    end
     add!(mpm, Annotation2MetadataPass())
+    add!(mpm, InferFunctionAttrsPass())
     add!(mpm, ConstantMergePass())
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, LowerExpectIntrinsicPass())
         if opt_level >= 2
             add!(fpm, PropagateJuliaAddrspacesPass())
         end
+        # DCE must come before simplifycfg: codegen can generate unused
+        # statements that would otherwise alter how simplifycfg optimizes the CFG.
+        add!(fpm, DCEPass())
         add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
         if opt_level >= 1
-            add!(fpm, DCEPass())
             add!(fpm, SROAPass())
+            add!(fpm, EarlyCSEPass())
         end
     end
-    # TODO invokeEarlySimplificationCallbacks
+    if opt_level >= 1
+        add!(mpm, GlobalOptPass())
+        add!(mpm, NewPMFunctionPassManager()) do fpm
+            add!(fpm, PromotePass())
+            add!(fpm, instcombine_pass(job))
+        end
+    end
+    if LLVM.version() >= v"17"
+        add!(mpm, PipelineEarlySimplificationCallbacks(; opt_level))
+    end
 end
 
 function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
+    if LLVM.version() >= v"17"
+        add!(mpm, OptimizerEarlyCallbacks(; opt_level))
+    end
     add!(mpm, NewPMCGSCCPassManager()) do cgpm
-        # TODO invokeCGSCCCallbacks
-        add!(cgpm, NewPMFunctionPassManager()) do fpm
-            add!(fpm, AllocOptPass())
-            add!(fpm, Float2IntPass())
-            add!(fpm, LowerConstantIntrinsicsPass())
+        if LLVM.version() >= v"17"
+            add!(cgpm, CGSCCOptimizerLateCallbacks(; opt_level))
+        end
+        if opt_level >= 2
+            add!(cgpm, NewPMFunctionPassManager()) do fpm
+                add!(fpm, AllocOptPass())
+                add!(fpm, Float2IntPass())
+                add!(fpm, LowerConstantIntrinsicsPass())
+            end
         end
     end
     add!(mpm, GPULowerCPUFeaturesPass())
@@ -139,50 +161,63 @@ function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_l
         add!(mpm, NewPMFunctionPassManager()) do fpm
             if opt_level >= 2
                 add!(fpm, SROAPass())
+                add!(fpm, EarlyCSEPass(; memssa=true))
                 add!(fpm, instcombine_pass(job))
+                add!(fpm, AggressiveInstCombinePass())
                 add!(fpm, JumpThreadingPass())
                 add!(fpm, CorrelatedValuePropagationPass())
+                add!(fpm, LibCallsShrinkWrapPass())
                 add!(fpm, ReassociatePass())
-                add!(fpm, EarlyCSEPass())
+                add!(fpm, ConstraintEliminationPass())
                 add!(fpm, AllocOptPass())
             else
-                add!(fpm, instcombine_pass(job))
                 add!(fpm, EarlyCSEPass())
+                add!(fpm, instcombine_pass(job))
+            end
+            if LLVM.version() >= v"17"
+                add!(fpm, PeepholeCallbacks(; opt_level))
             end
         end
-        # TODO invokePeepholeCallbacks
     end
+    add!(mpm, GlobalOptPass())
+    add!(mpm, GlobalDCEPass())
 end
 
 function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
-    add!(fpm, NewPMLoopPassManager()) do lpm
+    add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
         add!(lpm, LowerSIMDLoopPass())
         if opt_level >= 2
+            add!(lpm, LoopInstSimplifyPass())
+            add!(lpm, LoopSimplifyCFGPass())
+            # run LICM with speculation disabled before rotation (as LLVM's own pipeline
+            # does); speculative hoisting is left to the LICM run after LoopRotatePass.
+            add!(lpm, LICMPass(; allowspeculation=false))
+            add!(lpm, JuliaLICMPass())
             add!(lpm, LoopRotatePass())
-        end
-        # TODO invokeLateLoopOptimizationCallbacks
-    end
-    if opt_level >= 2
-        add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
             add!(lpm, LICMPass())
             add!(lpm, JuliaLICMPass())
             add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
-            add!(lpm, LICMPass())
-            add!(lpm, JuliaLICMPass())
+        end
+        if LLVM.version() >= v"17"
+            add!(lpm, LateLoopOptimizationsCallbacks(; opt_level))
         end
     end
     if opt_level >= 2
         add!(fpm, IRCEPass())
     end
+    add!(fpm, SimplifyCFGPass(; BasicSimplifyCFGOptions...))
+    add!(fpm, instcombine_pass(job))
     add!(fpm, NewPMLoopPassManager()) do lpm
         if opt_level >= 2
-            add!(lpm, LoopInstSimplifyPass())
             add!(lpm, LoopIdiomRecognizePass())
             add!(lpm, IndVarSimplifyPass())
+            add!(lpm, SimpleLoopUnswitchPass(nontrivial=true, trivial=true))
             add!(lpm, LoopDeletionPass())
             add!(lpm, LoopFullUnrollPass())
         end
-        # TODO invokeLoopOptimizerEndCallbacks
+        if LLVM.version() >= v"17"
+            add!(lpm, LoopOptimizerEndCallbacks(; opt_level))
+        end
     end
 end
 
@@ -190,44 +225,84 @@ function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_
     if opt_level >= 2
         add!(fpm, AllocOptPass())
         add!(fpm, SROAPass())
-        add!(fpm, InstSimplifyPass())
+        add!(fpm, VectorCombinePass())
+        add!(fpm, MergedLoadStoreMotionPass())
         add!(fpm, GVNPass())
-        add!(fpm, MemCpyOptPass())
         add!(fpm, SCCPPass())
+        add!(fpm, BDCEPass())
+        add!(fpm, instcombine_pass(job))
         add!(fpm, CorrelatedValuePropagationPass())
-        add!(fpm, DCEPass())
+        add!(fpm, ADCEPass())
+        add!(fpm, MemCpyOptPass())
+        add!(fpm, DSEPass())
         add!(fpm, IRCEPass())
-        add!(fpm, instcombine_pass(job))
         add!(fpm, JumpThreadingPass())
+        add!(fpm, ConstraintEliminationPass())
+    elseif opt_level >= 1
+        add!(fpm, AllocOptPass())
+        add!(fpm, SROAPass())
+        add!(fpm, MemCpyOptPass())
+        add!(fpm, SCCPPass())
+        add!(fpm, BDCEPass())
+        add!(fpm, instcombine_pass(job))
+        add!(fpm, ADCEPass())
     end
     if opt_level >= 3
         add!(fpm, GVNPass())
     end
     if opt_level >= 2
         add!(fpm, DSEPass())
-        # TODO invokePeepholeCallbacks
+        if LLVM.version() >= v"17"
+            add!(fpm, PeepholeCallbacks(; opt_level))
+        end
         add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
         add!(fpm, AllocOptPass())
-        add!(fpm, NewPMLoopPassManager()) do lpm
-            add!(lpm, LoopDeletionPass())
-            add!(lpm, LoopInstSimplifyPass())
+        add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
+            add!(lpm, LICMPass())
+            add!(lpm, JuliaLICMPass())
         end
-        add!(fpm, LoopDistributePass())
+        add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
+        add!(fpm, instcombine_pass(job))
+    elseif opt_level >= 1
+        add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
+    end
+    if LLVM.version() >= v"17"
+        add!(fpm, ScalarOptimizerLateCallbacks(; opt_level))
     end
-    # TODO invokeScalarOptimizerCallbacks
 end
 
 function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level)
+    # re-rotate loops that might have been unrotated by earlier simplification passes
+    add!(fpm, NewPMLoopPassManager()) do lpm
+        add!(lpm, LoopRotatePass())
+        add!(lpm, LoopDeletionPass())
+    end
+    add!(fpm, LoopDistributePass())
     add!(fpm, InjectTLIMappings())
     add!(fpm, LoopVectorizePass())
     add!(fpm, LoopLoadEliminationPass())
-    add!(fpm, instcombine_pass(job))
     add!(fpm, SimplifyCFGPass(; AggressiveSimplifyCFGOptions...))
+    add!(fpm, NewPMLoopPassManager(; use_memory_ssa=true)) do lpm
+        add!(lpm, LICMPass())
+    end
+    add!(fpm, EarlyCSEPass())
+    add!(fpm, CorrelatedValuePropagationPass())
+    add!(fpm, instcombine_pass(job))
     add!(fpm, SLPVectorizerPass())
     add!(fpm, VectorCombinePass())
-    # TODO invokeVectorizerCallbacks
-    add!(fpm, ADCEPass())
+    if LLVM.version() >= v"17"
+        add!(fpm, VectorizerStartCallbacks(; opt_level))
+    end
     add!(fpm, LoopUnrollPass(; opt_level))
+    if LLVM.version() >= v"21"  # NOTE(review): 21 vs. 17 for other callbacks; confirm
+        add!(fpm, VectorizerEndCallbacks(; opt_level))
+    end
+    if LLVM.version() >= v"16"
+        add!(fpm, SROAPass(; preserve_cfg=true))
+    else
+        add!(fpm, SROAPass())
+    end
+    add!(fpm, InstSimplifyPass())
 end
 
 function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
@@ -312,13 +387,15 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
             add!(fpm, DivRemPairsPass())
         end
     end
-    # TODO invokeOptimizerLastCallbacks
+    if LLVM.version() >= v"17"
+        add!(mpm, OptimizerLastCallbacks(; opt_level))
+    end
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, AnnotationRemarksPass())
     end
     add!(mpm, NewPMFunctionPassManager()) do fpm
         add!(fpm, DemoteFloat16Pass())
-        if opt_level >= 1
+        if opt_level >= 2
             add!(fpm, GVNPass())
         end
     end
diff --git a/src/ptx.jl b/src/ptx.jl
index 4a977371..a3af659d 100644
--- a/src/ptx.jl
+++ b/src/ptx.jl
@@ -130,6 +130,18 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
 
 function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
                         mod::LLVM.Module, entry::LLVM.Function)
+    # tell NVVMReflect whether to flush denormals; this mirrors what Clang does
+    # for `-fcuda-flush-denormals-to-zero`. the flag backs `__CUDA_FTZ`, the only
+    # reflect key LLVM's NVVMReflectPass honors besides `__CUDA_ARCH`. only emit it on the
+    # toplevel module that runs through `optimize!`, as sub-modules (the cached
+    # runtime, deferred jobs) don't need it, and the cached runtime in
+    # particular would otherwise conflict on link if it was built with a
+    # different `fastmath` setting (which isn't part of `runtime_slug`).
+    if job.config.toplevel
+        flags(mod)["nvvm-reflect-ftz", LLVM.API.LLVMModuleFlagBehaviorOverride] =
+            Metadata(ConstantInt(Int32(job.config.target.fastmath ? 1 : 0)))
+    end
+
     # emit the device capability and ptx isa version as constants in the module. this makes
     # it possible to 'query' these in device code, relying on LLVM to optimize the checks
     # away and generate static code. note that we only do so if there's actual uses of these
@@ -261,12 +273,17 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
     tm = llvm_machine(job.config.target)
     # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450)
     @dispose pb=NewPMPassBuilder() begin
-        register!(pb, NVVMReflectPass())
         register!(pb, PTXRSqrtFastPass())
         register!(pb, PTXFDivFastPass())
         register!(pb, PTXFSqrtFastPass())
-
-        add!(pb, NVVMReflectPass())
+        if LLVM.version() < v"17"
+            # Pre-17 LLVM has no way to invoke EP callbacks from the string
+            # API, so fall back to our own nvvm_reflect! implementation.
+            # LLVM 17+ picks up NVPTX's built-in NVVMReflectPass through the
+            # PipelineStart EP invocations woven into `buildNewPMPipeline!`.
+            register!(pb, NVVMReflectPass())
+            add!(pb, NVVMReflectPass())
+        end
         add!(pb, PTXRSqrtFastPass())
         add!(pb, PTXFDivFastPass())
         add!(pb, PTXFSqrtFastPass())
@@ -455,9 +472,12 @@ end
 
 # Replace occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect with an integer.
 #
-# NOTE: this is the same as LLVM's NVVMReflect pass, which we cannot use because it is
-#       not exported. It is meant to be added to a pass pipeline automatically, by
-#       calling adjustPassManager, but we don't use a PassManagerBuilder so cannot do so.
+# This is a back-port of LLVM's NVVMReflectPass for LLVM < 17, where the
+# built-in pass cannot be invoked via the string-API PipelineStart EP callback.
+# Semantics match LLVM's: `__CUDA_ARCH` is derived from the target capability,
+# `__CUDA_FTZ` is read from the `nvvm-reflect-ftz` module flag, and every other
+# key folds to 0. Knobs like denormal flushing or FMAD contraction must be
+# configured through module flags or LLVM fast-math flags, not here.
 const NVVM_REFLECT_FUNCTION = "__nvvm_reflect"
 function nvvm_reflect!(mod::LLVM.Module)
     job = current_job::CompilerJob
@@ -472,6 +492,18 @@ function nvvm_reflect!(mod::LLVM.Module)
     reflect_typ = return_type(function_type(reflect_function))
     isa(reflect_typ, LLVM.IntegerType) || error("_reflect's return type should be integer")
 
+    # pull __CUDA_FTZ from the nvvm-reflect-ftz module flag (same source LLVM uses)
+    ftz_val = 0
+    if haskey(flags(mod), "nvvm-reflect-ftz")
+        flag = flags(mod)["nvvm-reflect-ftz"]
+        if flag isa LLVM.ConstantAsMetadata
+            c = LLVM.Value(flag)
+            if c isa ConstantInt
+                ftz_val = Int(convert(Int64, c))
+            end
+        end
+    end
+
     to_remove = []
     for use in uses(reflect_function)
         call = user(use)
@@ -515,31 +547,14 @@ function nvvm_reflect!(mod::LLVM.Module)
         chars = convert.(Ref(UInt8), collect(sym_op))
         reflect_arg = String(chars[1:end-1])
 
-        # handle possible cases
-        # XXX: put some of these property in the compiler job?
-        #      and/or first set the "nvvm-reflect-*" module flag like Clang does?
-        fast_math = current_job.config.target.fastmath
-        # NOTE: we follow nvcc's --use_fast_math
-        reflect_val = if reflect_arg == "__CUDA_FTZ"
-            # single-precision denormals support
-            ConstantInt(reflect_typ, fast_math ? 1 : 0)
-        elseif reflect_arg == "__CUDA_PREC_DIV"
-            # single-precision floating-point division and reciprocals.
-            ConstantInt(reflect_typ, fast_math ? 0 : 1)
-        elseif reflect_arg == "__CUDA_PREC_SQRT"
-            # single-precision floating point square roots.
-            ConstantInt(reflect_typ, fast_math ? 0 : 1)
-        elseif reflect_arg == "__CUDA_FMAD"
-            # contraction of floating-point multiplies and adds/subtracts into
-            # floating-point multiply-add operations (FMAD, FFMA, or DFMA)
-            ConstantInt(reflect_typ, fast_math ? 1 : 0)
-        elseif reflect_arg == "__CUDA_ARCH"
-            ConstantInt(reflect_typ, job.config.target.cap.major*100 + job.config.target.cap.minor*10)
+        # match LLVM's NVVMReflectPass: unknown keys fold to 0.
+        reflect_val = if reflect_arg == "__CUDA_ARCH"
+            ConstantInt(reflect_typ,
+                        job.config.target.cap.major*100 + job.config.target.cap.minor*10)
+        elseif reflect_arg == "__CUDA_FTZ"
+            ConstantInt(reflect_typ, ftz_val)
         else
-            @safe_error """Unrecognized format of __nvvm_reflect call:
-                           $(string(call))
-                           Unknown argument $reflect_arg. Please file an issue."""
-            continue
+            ConstantInt(reflect_typ, 0)
         end
 
         replace_uses!(call, reflect_val)